mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 15:23:26 +01:00
Improve list detection
* Add ' ' after the list character on compact lines that start with a list character
* Add – as list character
* Rename functions.jsx to stringFunctions.jsx
This commit is contained in:
parent
106e2bfa8e
commit
c4679238cd
@ -1,4 +1,4 @@
|
|||||||
import { normalizedCharCodeArray } from '../functions.jsx'
|
import { normalizedCharCodeArray } from '../stringFunctions.jsx'
|
||||||
|
|
||||||
export default class HeadlineFinder {
|
export default class HeadlineFinder {
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ import WordType from './markdown/WordType.jsx';
|
|||||||
import LineItem from './LineItem.jsx';
|
import LineItem from './LineItem.jsx';
|
||||||
import StashingStream from './StashingStream.jsx';
|
import StashingStream from './StashingStream.jsx';
|
||||||
import { ParsedElements } from './PageItem.jsx';
|
import { ParsedElements } from './PageItem.jsx';
|
||||||
import { isNumber } from '../functions.jsx'
|
import { isNumber, isListItemCharacter } from '../stringFunctions.jsx'
|
||||||
import { sortByX } from '../pageItemFunctions.jsx'
|
import { sortByX } from '../pageItemFunctions.jsx'
|
||||||
|
|
||||||
// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like
|
// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like
|
||||||
@ -146,13 +146,20 @@ function combineText(textItems) {
|
|||||||
var text = '';
|
var text = '';
|
||||||
var lastItem;
|
var lastItem;
|
||||||
textItems.forEach(textItem => {
|
textItems.forEach(textItem => {
|
||||||
if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) {
|
var textToAdd = textItem.text;
|
||||||
const xDistance = textItem.x - lastItem.x - lastItem.width;
|
if (!text.endsWith(' ') && !textToAdd.startsWith(' ')) {
|
||||||
if (xDistance > 5) {
|
if (lastItem) {
|
||||||
text += ' ';
|
const xDistance = textItem.x - lastItem.x - lastItem.width;
|
||||||
|
if (xDistance > 5) {
|
||||||
|
text += ' ';
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (isListItemCharacter(textItem.text)) {
|
||||||
|
textToAdd += ' ';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
text += textItem.text;
|
text += textToAdd;
|
||||||
lastItem = textItem;
|
lastItem = textItem;
|
||||||
});
|
});
|
||||||
return text;
|
return text;
|
||||||
|
@ -3,9 +3,9 @@ import ParseResult from '../../ParseResult.jsx';
|
|||||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { headlineByLevel } from '../../ElementType.jsx';
|
import { headlineByLevel } from '../../ElementType.jsx';
|
||||||
import { isListItem } from '../../../functions.jsx';
|
import { isListItem } from '../../../stringFunctions.jsx';
|
||||||
|
|
||||||
//Detect items starting with -, •, etc...
|
//Detect headlines based on heights
|
||||||
export default class DetectHeaders extends ToLineItemTransformation {
|
export default class DetectHeaders extends ToLineItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import LineItem from '../../LineItem.jsx';
|
import LineItem from '../../LineItem.jsx';
|
||||||
|
import Word from '../../Word.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
import { isListItemCharacter, isNumberedListItem } from '../../../stringFunctions.jsx';
|
||||||
|
|
||||||
//Detect items starting with -, •, etc...
|
//Detect items starting with -, •, etc...
|
||||||
export default class DetectListItems extends ToLineItemTransformation {
|
export default class DetectListItems extends ToLineItemTransformation {
|
||||||
@ -21,17 +22,20 @@ export default class DetectListItems extends ToLineItemTransformation {
|
|||||||
newItems.push(item);
|
newItems.push(item);
|
||||||
if (!item.type) {
|
if (!item.type) {
|
||||||
var text = item.text();
|
var text = item.text();
|
||||||
if (isListItem(text)) {
|
if (isListItemCharacter(item.words[0].string)) {
|
||||||
foundListItems++
|
foundListItems++
|
||||||
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
if (item.words[0].string === '-') {
|
||||||
if (textWithDash === text) {
|
|
||||||
item.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
item.type = ElementType.LIST;
|
item.type = ElementType.LIST;
|
||||||
} else {
|
} else {
|
||||||
item.annotation = REMOVED_ANNOTATION;
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
|
const newWords = item.words.map(word => new Word({
|
||||||
|
...word
|
||||||
|
}));
|
||||||
|
newWords[0].string = '-';
|
||||||
newItems.push(new LineItem({
|
newItems.push(new LineItem({
|
||||||
...item,
|
...item,
|
||||||
text: textWithDash,
|
words: newWords,
|
||||||
annotation: ADDED_ANNOTATION,
|
annotation: ADDED_ANNOTATION,
|
||||||
type: ElementType.LIST
|
type: ElementType.LIST
|
||||||
}));
|
}));
|
||||||
|
@ -6,7 +6,7 @@ import HeadlineFinder from '../../HeadlineFinder.jsx';
|
|||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { headlineByLevel } from '../../ElementType.jsx';
|
import { headlineByLevel } from '../../ElementType.jsx';
|
||||||
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx'
|
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx'
|
||||||
|
|
||||||
//Detect table of contents pages plus linked headlines
|
//Detect table of contents pages plus linked headlines
|
||||||
export default class DetectTOC extends ToLineItemTransformation {
|
export default class DetectTOC extends ToLineItemTransformation {
|
||||||
|
@ -2,7 +2,7 @@ import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
|||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
|
|
||||||
import { isDigit } from '../../../functions.jsx'
|
import { isDigit } from '../../../stringFunctions.jsx'
|
||||||
|
|
||||||
|
|
||||||
function hashCodeIgnoringSpacesAndNumbers(string) {
|
function hashCodeIgnoringSpacesAndNumbers(string) {
|
||||||
|
@ -92,8 +92,17 @@ export function suffixBeforeWhitespace(string, suffix) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function isListItemCharacter(string) {
|
||||||
|
if (string.length > 1) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
const char = string.charAt(0);
|
||||||
|
return char === '-' || char === '•' || char === '–';
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
export function isListItem(string) {
|
export function isListItem(string) {
|
||||||
return /^[\s]*[-•][\s].*$/g.test(string);
|
return /^[\s]*[-•–][\s].*$/g.test(string);
|
||||||
}
|
}
|
||||||
|
|
||||||
export function isNumberedListItem(string) {
|
export function isNumberedListItem(string) {
|
@ -1,6 +1,6 @@
|
|||||||
import { expect } from 'chai';
|
import { expect } from 'chai';
|
||||||
|
|
||||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/stringFunctions.jsx'
|
||||||
|
|
||||||
describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => {
|
describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||||
|
|
||||||
@ -144,6 +144,9 @@ describe('functions: isListItem', () => {
|
|||||||
expect(isListItem('• my text')).to.equal(true);
|
expect(isListItem('• my text')).to.equal(true);
|
||||||
expect(isListItem(' • my text')).to.equal(true);
|
expect(isListItem(' • my text')).to.equal(true);
|
||||||
expect(isListItem(' • my text')).to.equal(true);
|
expect(isListItem(' • my text')).to.equal(true);
|
||||||
|
|
||||||
|
expect(isListItem('– my text')).to.equal(true);
|
||||||
|
expect(isListItem(' – my text')).to.equal(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('No Match', () => {
|
it('No Match', () => {
|
Loading…
Reference in New Issue
Block a user