mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-22 07:43:46 +01:00
[WIP] Simplify list detection
This commit is contained in:
parent
f8fecc4c1d
commit
c6f592d3fc
@ -47,3 +47,18 @@ export function charCodeArray(string) {
|
||||
}
|
||||
return charCodes;
|
||||
}
|
||||
|
||||
// Returns the given string without its leading run of WHITESPACE_CHAR_CODE characters.
// Only that one character code counts as whitespace here; any other leading
// character (including other kinds of whitespace) stops the stripping.
export function removeLeadingWhitespaces(string) {
    // Find the first character to keep and cut once, instead of building a new
    // string for every stripped character.
    let firstKeptIndex = 0;
    // charCodeAt() yields NaN past the end of the string, which ends the loop.
    while (string.charCodeAt(firstKeptIndex) === WHITESPACE_CHAR_CODE) {
        firstKeptIndex++;
    }
    return string.substring(firstKeptIndex);
}
|
||||
|
||||
// Tells whether a line looks like a plain (bulleted) list item: optional leading
// whitespace, a '-' or '•' bullet, exactly one mandatory whitespace character and
// then text whose last character is not a bullet character itself.
export function isListItem(string) {
    // No 'g' flag: test() on a global regex is stateful (lastIndex), which gains
    // nothing for an anchored pattern and turns into a trap once the regex is reused.
    return /^\s*[-•]\s.*[^-•]$/.test(string);
}
|
||||
|
||||
// Tells whether a line looks like a numbered list item: optional leading
// whitespace, at least one digit, a dot, one whitespace character and then
// the rest of the line.
export function isNumberedListItem(string) {
    // '\d+' (not '\d*'): a line starting with just ". " carries no number and
    // must not be reported as a numbered item.
    return /^\s*\d+\.\s.*$/.test(string);
}
|
||||
|
@ -22,3 +22,13 @@ export const UNCHANGED_ANNOTATION = new Annotation({
|
||||
category: 'Unchanged',
|
||||
color: 'brown'
|
||||
})
|
||||
|
||||
// Annotation with category 'Detected' and color 'green'. DetectListItems assigns it
// to text items it recognises as list items without having to rewrite their text.
export const DETECTED_ANNOTATION = new Annotation({
|
||||
category: 'Detected',
|
||||
color: 'green'
|
||||
});
|
||||
|
||||
// Annotation with category 'Modified' and color 'green'. DetectListLevels assigns it
// to list blocks in which at least one text item received an indentation prefix.
// NOTE(review): uses the same color as DETECTED_ANNOTATION - confirm this is intended.
export const MODIFIED_ANNOTATION = new Annotation({
|
||||
category: 'Modified',
|
||||
color: 'green'
|
||||
});
|
@ -5,9 +5,10 @@ import CompactLines from './transformations/CompactLines.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/DetectListItems.jsx'
|
||||
|
||||
import GatherBlocks from './transformations/GatherBlocks.jsx'
|
||||
import DetectLists from './transformations/DetectLists.jsx'
|
||||
import DetectListLevels from './transformations/DetectListLevels.jsx'
|
||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
||||
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
@ -34,9 +35,10 @@ export default class AppState {
|
||||
new RemoveRepetitiveElements(),
|
||||
new VerticalToHorizontal(),
|
||||
new DetectTOC(),
|
||||
new DetectListItems(),
|
||||
|
||||
new GatherBlocks(),
|
||||
new DetectLists(),
|
||||
new DetectListLevels(),
|
||||
new DetectCodeBlocks(),
|
||||
new DetectHeadlines(),
|
||||
|
||||
|
@ -55,6 +55,8 @@ ElementType.initEnum({
|
||||
}
|
||||
},
|
||||
LIST: {
|
||||
mergeToBlock: true,
|
||||
mergeFollowingNonTypedItemsWithSmallDistance: true,
|
||||
toText(block:TextItemBlock) {
|
||||
return concatTextItems(block.textItems);
|
||||
}
|
||||
@ -70,7 +72,6 @@ export function blockToText(block: TextItemBlock) {
|
||||
if (!block.type) {
|
||||
return concatTextItems(block.textItems);
|
||||
}
|
||||
console.debug(block.type);
|
||||
return block.type.toText(block);
|
||||
}
|
||||
|
||||
|
@ -19,7 +19,7 @@ export class ParsedElements {
|
||||
this.footnotes = options.footnotes;
|
||||
}
|
||||
|
||||
add(parsedElements:ParsedElements) {
|
||||
add(parsedElements) {
|
||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||
}
|
||||
|
59
src/javascript/models/transformations/DetectListItems.jsx
Normal file
59
src/javascript/models/transformations/DetectListItems.jsx
Normal file
@ -0,0 +1,59 @@
|
||||
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../functions.jsx';
|
||||
|
||||
//Detect items starting with -, •, etc...
|
||||
export default class DetectListItems extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect List Items");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var foundListItems = 0;
|
||||
var foundNumberedItems = 0;
|
||||
parseResult.pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.items.forEach(textItem => {
|
||||
newTextItems.push(textItem);
|
||||
if (!textItem.type) {
|
||||
var text = textItem.text;
|
||||
if (isListItem(text)) {
|
||||
foundListItems++
|
||||
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
||||
if (textWithDash === text) {
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.LIST;
|
||||
} else {
|
||||
textItem.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(new TextItem({
|
||||
...textItem,
|
||||
text: textWithDash,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
type: ElementType.LIST
|
||||
}));
|
||||
}
|
||||
} else if (isNumberedListItem(text)) {
|
||||
foundNumberedItems++;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.LIST;
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newTextItems;
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Detected ' + foundListItems + ' plain list items.',
|
||||
'Detected ' + foundNumberedItems + ' numbered list items.'
|
||||
]
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
58
src/javascript/models/transformations/DetectListLevels.jsx
Normal file
58
src/javascript/models/transformations/DetectListLevels.jsx
Normal file
@ -0,0 +1,58 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
|
||||
// Takes care of proper sub-item spacing/leveling
|
||||
export default class DetectListLevels extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Level Lists");
|
||||
this.showWhitespaces = true;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var listBlocks = 0;
|
||||
var modifiedBlocks = 0;
|
||||
parseResult.pages.forEach(page => {
|
||||
|
||||
page.items.filter(block => block.type === ElementType.LIST).forEach(listBlock => {
|
||||
var lastItemX;
|
||||
var currentLevel = 0;
|
||||
const xByLevel = {};
|
||||
var modifiedBlock = false;
|
||||
listBlock.textItems.forEach(textItem => {
|
||||
const isListItem = true;
|
||||
if (lastItemX && isListItem) {
|
||||
if (textItem.x > lastItemX) {
|
||||
currentLevel++;
|
||||
xByLevel[textItem.x] = currentLevel;
|
||||
} else if (textItem.x < lastItemX) {
|
||||
currentLevel = xByLevel[textItem.x];
|
||||
}
|
||||
} else {
|
||||
xByLevel[textItem.x] = 0;
|
||||
}
|
||||
if (currentLevel > 0) {
|
||||
textItem.text = ' '.repeat(currentLevel * 3) + textItem.text;
|
||||
modifiedBlock = true;
|
||||
}
|
||||
lastItemX = textItem.x;
|
||||
});
|
||||
listBlocks++;
|
||||
if (modifiedBlock) {
|
||||
modifiedBlocks++;
|
||||
listBlock.annotation = MODIFIED_ANNOTATION;
|
||||
} else {
|
||||
listBlock.annotation = UNCHANGED_ANNOTATION;
|
||||
}
|
||||
});
|
||||
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: ['Modified ' + modifiedBlocks + ' / ' + listBlocks + ' list blocks.']
|
||||
});
|
||||
|
||||
}
|
||||
}
|
@ -1,142 +0,0 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detect lists (lines starting with '-', '•' or a digit) and indent their sub-items by level
|
||||
export default class DetectLists extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Lists");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var foundBlocks = 0;
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
parseResult.pages.forEach(page => {
|
||||
var minX = minXFromBlocks(page.items);
|
||||
if (minX) {
|
||||
const newBlocks = [];
|
||||
page.items.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
if (!block.type) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (hasMoreThan2LineItems(combineResult.textItems)) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
foundBlocks++;
|
||||
|
||||
var lastItemX;
|
||||
var currentLevel = 0;
|
||||
var xByLevel = {};
|
||||
var itemsBeforeFirstLineItem = [];
|
||||
var listBlockItems = [];
|
||||
|
||||
const pushLineItem = (originalItem, text, setLevel) => {
|
||||
if (lastItemX && setLevel) {
|
||||
if (originalItem.x > lastItemX) {
|
||||
currentLevel++;
|
||||
xByLevel[originalItem.x] = currentLevel;
|
||||
} else if (originalItem.x < lastItemX) {
|
||||
currentLevel = xByLevel[originalItem.x];
|
||||
}
|
||||
} else {
|
||||
xByLevel[originalItem.x] = 0;
|
||||
}
|
||||
|
||||
|
||||
listBlockItems.push(new TextItem({
|
||||
...originalItem,
|
||||
text: ' '.repeat(currentLevel * 3) + text
|
||||
}));
|
||||
lastItemX = originalItem.x;
|
||||
|
||||
};
|
||||
|
||||
combineResult.textItems.forEach(lineItem => {
|
||||
if (isPlainListItem(lineItem.text)) {
|
||||
var text = lineItem.text;
|
||||
text = text.substring(1, text.length).trim();
|
||||
text = '- ' + text;
|
||||
pushLineItem(lineItem, text, true);
|
||||
|
||||
} else if (isNumberedListItem(lineItem.text)) {
|
||||
var numberedText = lineItem.text;
|
||||
numberedText
|
||||
pushLineItem(lineItem, numberedText, true);
|
||||
} else {
|
||||
if (lastItemX) {
|
||||
pushLineItem(lineItem, lineItem.text, false);
|
||||
} else {
|
||||
itemsBeforeFirstLineItem.push(lineItem);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if (itemsBeforeFirstLineItem.length > 0) {
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: itemsBeforeFirstLineItem,
|
||||
type: ElementType.PARAGRAPH,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
}
|
||||
//TODO display with whitespace pre support
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: listBlockItems,
|
||||
type: ElementType.LIST,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
parsedElements: combineResult.parsedElements
|
||||
}));
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newBlocks;
|
||||
}
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: ['Detected ' + foundBlocks + ' list blocks.']
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
function hasMoreThan2LineItems(textItems:TextItem[]) {
|
||||
var numberOfListItemLineStarts = 0;
|
||||
for ( let item of textItems ) {
|
||||
if (isPlainListItem(item.text) || isNumberedListItem(item.text)) {
|
||||
numberOfListItemLineStarts++;
|
||||
if (numberOfListItemLineStarts == 2) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// A plain list item is a line whose very first character is '-' or '•'.
function isPlainListItem(string) {
    return ['-', '•'].some(bullet => string.startsWith(bullet));
}
|
||||
|
||||
// A numbered list item is a line whose very first character is a digit.
function isNumberedListItem(string) {
    const firstCharacter = string.charAt(0);
    return !Number.isNaN(parseInt(firstCharacter, 10));
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import { ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { minXFromTextItems } from '../../textItemFunctions.jsx';
|
||||
|
||||
// Gathers lines to blocks
|
||||
@ -21,7 +21,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
|
||||
var stashedBlock = new TextItemBlock({});
|
||||
const flushStashedItems = () => {
|
||||
if (stashedBlock.textItems.length > 1) {
|
||||
stashedBlock.annotation = ADDED_ANNOTATION;
|
||||
stashedBlock.annotation = DETECTED_ANNOTATION;
|
||||
}
|
||||
|
||||
blocks.push(stashedBlock);
|
||||
@ -54,19 +54,23 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
|
||||
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
|
||||
return false;
|
||||
}
|
||||
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
|
||||
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
|
||||
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
|
||||
return false;
|
||||
}
|
||||
if (item.type !== stashedBlock.type) {
|
||||
return true;
|
||||
}
|
||||
if (item.type) {
|
||||
return !item.type.mergeToBlock;
|
||||
} else {
|
||||
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
|
||||
return shouldSplit(lastItem, item, minX, mostUsedDistance);
|
||||
return hasBigDistance;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
|
||||
function bigDistance(lastItem, item, minX, mostUsedDistance) {
|
||||
const distance = lastItem.y - item.y;
|
||||
if (distance < 0 - mostUsedDistance / 2) {
|
||||
//distance is negative - and not only a bit
|
||||
|
@ -1,27 +0,0 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import Headline from '../src/javascript/models/markdown/Headline';
|
||||
|
||||
describe('Headline', () => {

    // One case per headline level: the level passed in and the markdown expected
    // for the text 'Hello World'.
    [
        { title: 'correct level 1 props', level: 1, markdown: '# Hello World' },
        { title: 'correct level 2 props', level: 2, markdown: '## Hello World' }
    ].forEach(({ title, level, markdown }) => {
        it(title, () => {
            const headline = new Headline({
                level: level
            });
            expect(headline.level).to.equal(level);
            expect(headline.newLineBefore).to.equal(true);
            expect(headline.newLineAfter).to.equal(true);
            expect(headline.transformText('Hello World')).to.equal(markdown);
        });
    });

});
|
@ -1,6 +1,6 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, charCodeArray } from '../src/javascript/functions.jsx'
|
||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'
|
||||
|
||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
|
||||
@ -38,6 +38,23 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('removeLeadingWhitespaces', () => {

    // Asserts that stripping the input yields the expected text.
    const expectStripped = (input, expected) => {
        expect(removeLeadingWhitespaces(input)).to.be.equal(expected);
    };

    it('No Removes', () => {
        [".", ". ", ". . "].forEach(text => expectStripped(text, text));
    });

    it('Removes', () => {
        // NOTE(review): the first two inputs are identical here - presumably they
        // differ in the number of leading spaces in the real file; verify.
        [
            [" .", "."],
            [" .", "."],
            [" . ", ". "],
            [" . . ", ". . "]
        ].forEach(([input, expected]) => expectStripped(input, expected));
    });

});
|
||||
|
||||
|
||||
describe('charCodeArray', () => {
|
||||
it('Charcodes', () => {
|
||||
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
|
||||
@ -76,3 +93,45 @@ describe('normalizedCharCodeArray', () => {
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
describe('isListItem', () => {

    // Asserts that every given line yields the expected detection result.
    const expectAll = (lines, expected) => {
        lines.forEach(line => expect(isListItem(line)).to.equal(expected));
    };

    it('Match', () => {
        expectAll([
            '- my text',
            ' - my text',
            ' - my text',
            '• my text',
            ' • my text',
            ' • my text'
        ], true);
    });

    it('No Match', () => {
        expectAll([
            'my text',
            '-my text',
            '•my text',
            ' -my text',
            '- my text -',
            '• my text •'
        ], false);
    });

});
|
||||
|
||||
describe('isNumberedListItem', () => {

    // Asserts that every given line yields the expected detection result.
    const expectAll = (lines, expected) => {
        lines.forEach(line => expect(isNumberedListItem(line)).to.equal(expected));
    };

    it('Match', () => {
        expectAll([
            '1. my text',
            '2. my text',
            '23. my text',
            '23. my text',
            ' 23. my text',
            ' 23. my text'
        ], true);
    });

    it('No Match', () => {
        expectAll([
            '1two',
            '1 two',
            '1.two'
        ], false);
    });

});
|
||||
|
Loading…
Reference in New Issue
Block a user