[WIP] Simplify list detection

This commit is contained in:
Johannes Zillmann 2017-03-11 13:42:09 +01:00
parent f8fecc4c1d
commit c6f592d3fc
11 changed files with 219 additions and 180 deletions

View File

@ -46,4 +46,19 @@ export function charCodeArray(string) {
charCodes.push(string.charCodeAt(i)); charCodes.push(string.charCodeAt(i));
} }
return charCodes; return charCodes;
} }
/**
 * Strips every leading character whose char code equals WHITESPACE_CHAR_CODE.
 *
 * Only that single char code is removed; other characters are left untouched,
 * as is anything after the first non-matching character.
 *
 * @param string the text to strip
 * @returns the text starting at the first character that is not WHITESPACE_CHAR_CODE
 */
export function removeLeadingWhitespaces(string) {
    // Scan for the first character to keep and cut once, instead of
    // allocating a new substring for every stripped character.
    let firstKeptIndex = 0;
    // charCodeAt() yields NaN past the end of the string, which ends the loop.
    while (string.charCodeAt(firstKeptIndex) === WHITESPACE_CHAR_CODE) {
        firstKeptIndex++;
    }
    return string.substring(firstKeptIndex, string.length);
}
/**
 * Tells whether a line looks like a plain (bulleted) list item.
 *
 * A match needs: optional leading whitespace, a '-' or '•' marker, one
 * whitespace character, and a last character that is neither '-' nor '•'.
 *
 * @param string the line to inspect
 * @returns true when the line matches the pattern above
 */
export function isListItem(string) {
    // The literal is created on every call, so the 'g' flag carries no state between calls.
    const plainItemPattern = /^[\s]*[-•][\s].*[^-•]$/g;
    const matches = plainItemPattern.test(string);
    return matches;
}
/**
 * Tells whether a line looks like a numbered list item such as "1. text".
 *
 * A match needs: optional leading whitespace, one or more digits, a dot,
 * one whitespace character, then anything up to the end of the line.
 *
 * @param string the line to inspect
 * @returns true when the line matches the pattern above
 */
export function isNumberedListItem(string) {
    // At least one digit is required: a bare ". text" is not a numbered item.
    return /^\s*\d+\.\s.*$/.test(string);
}

View File

@ -22,3 +22,13 @@ export const UNCHANGED_ANNOTATION = new Annotation({
category: 'Unchanged', category: 'Unchanged',
color: 'brown' color: 'brown'
}) })
// Annotation for items a transformation recognised without changing them:
// DetectListItems sets it on list items whose text is kept as-is, and
// GatherBlocks sets it on gathered blocks holding more than one text item.
// NOTE(review): shares the colour 'green' with MODIFIED_ANNOTATION — confirm this is intended.
export const DETECTED_ANNOTATION = new Annotation({
category: 'Detected',
color: 'green'
});
// Annotation for items whose content a transformation changed in place;
// DetectListLevels sets it on list blocks where at least one item was indented.
// NOTE(review): shares the colour 'green' with DETECTED_ANNOTATION — confirm this is intended.
export const MODIFIED_ANNOTATION = new Annotation({
category: 'Modified',
color: 'green'
});

View File

@ -5,9 +5,10 @@ import CompactLines from './transformations/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/DetectTOC.jsx' import DetectTOC from './transformations/DetectTOC.jsx'
import DetectListItems from './transformations/DetectListItems.jsx'
import GatherBlocks from './transformations/GatherBlocks.jsx' import GatherBlocks from './transformations/GatherBlocks.jsx'
import DetectLists from './transformations/DetectLists.jsx' import DetectListLevels from './transformations/DetectListLevels.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectHeadlines from './transformations/DetectHeadlines.jsx' import DetectHeadlines from './transformations/DetectHeadlines.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx' // import DetectFormats from './transformations/DetectFormats.jsx'
@ -34,9 +35,10 @@ export default class AppState {
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new VerticalToHorizontal(), new VerticalToHorizontal(),
new DetectTOC(), new DetectTOC(),
new DetectListItems(),
new GatherBlocks(), new GatherBlocks(),
new DetectLists(), new DetectListLevels(),
new DetectCodeBlocks(), new DetectCodeBlocks(),
new DetectHeadlines(), new DetectHeadlines(),

View File

@ -55,6 +55,8 @@ ElementType.initEnum({
} }
}, },
LIST: { LIST: {
mergeToBlock: true,
mergeFollowingNonTypedItemsWithSmallDistance: true,
toText(block:TextItemBlock) { toText(block:TextItemBlock) {
return concatTextItems(block.textItems); return concatTextItems(block.textItems);
} }
@ -70,7 +72,6 @@ export function blockToText(block: TextItemBlock) {
if (!block.type) { if (!block.type) {
return concatTextItems(block.textItems); return concatTextItems(block.textItems);
} }
console.debug(block.type);
return block.type.toText(block); return block.type.toText(block);
} }

View File

@ -19,7 +19,7 @@ export class ParsedElements {
this.footnotes = options.footnotes; this.footnotes = options.footnotes;
} }
add(parsedElements:ParsedElements) { add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.footnotes = this.footnotes.concat(parsedElements.footnotes);
} }

View File

@ -0,0 +1,59 @@
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../functions.jsx';
//Detect items starting with -, •, or a number followed by a dot (e.g. "1.")
/**
 * Text-item transformation that tags single lines as list items.
 *
 * Only items without a type are inspected. Plain items ('-' / '•') are
 * normalised to start with '-' and no leading whitespace; numbered items are
 * tagged unchanged.
 */
export default class DetectListItems extends ToTextItemTransformation {

    constructor() {
        super("Detect List Items");
    }

    /**
     * Walks all items of all pages, tagging list items with ElementType.LIST.
     *
     * @param parseResult the result of the previous transformation; its pages
     *        get their `items` replaced and individual items are mutated
     * @returns a new ParseResult copying `parseResult`, with two summary messages
     */
    transform(parseResult:ParseResult) {
        let plainCount = 0;
        let numberedCount = 0;

        for (const page of parseResult.pages) {
            const rewrittenItems = [];

            for (const item of page.items) {
                // The original item always stays in the output (possibly annotated as removed).
                rewrittenItems.push(item);
                if (item.type) {
                    continue;
                }

                const originalText = item.text;
                if (isListItem(originalText)) {
                    plainCount += 1;
                    // Drop leading whitespace and swap the marker for '-'.
                    const normalizedText = '-' + removeLeadingWhitespaces(originalText).substring(1, originalText.length);
                    if (normalizedText !== originalText) {
                        // Text changed: keep the old item as removed and append the normalised copy.
                        item.annotation = REMOVED_ANNOTATION;
                        rewrittenItems.push(new TextItem({
                            ...item,
                            text: normalizedText,
                            annotation: ADDED_ANNOTATION,
                            type: ElementType.LIST
                        }));
                    } else {
                        item.annotation = DETECTED_ANNOTATION;
                        item.type = ElementType.LIST;
                    }
                } else if (isNumberedListItem(originalText)) {
                    numberedCount += 1;
                    item.annotation = DETECTED_ANNOTATION;
                    item.type = ElementType.LIST;
                }
            }

            page.items = rewrittenItems;
        }

        const messages = [
            'Detected ' + plainCount + ' plain list items.',
            'Detected ' + numberedCount + ' numbered list items.'
        ];
        return new ParseResult({
            ...parseResult,
            messages: messages
        });
    }

}

View File

@ -0,0 +1,58 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
// Cares for proper sub-item spacing/leveling
/**
 * Block transformation that indents the items of LIST blocks according to
 * their nesting level, which is derived from each item's x position.
 */
export default class DetectListLevels extends ToTextItemBlockTransformation {

    constructor() {
        super("Level Lists");
        this.showWhitespaces = true;
    }

    /**
     * Prefixes every nested list item with three spaces per level and
     * annotates each LIST block as modified or unchanged.
     *
     * @param parseResult the result of the previous transformation; text items
     *        and block annotations of its LIST blocks are mutated in place
     * @returns a new ParseResult copying `parseResult`, with one summary message
     */
    transform(parseResult:ParseResult) {
        var listBlocks = 0;
        var modifiedBlocks = 0;
        parseResult.pages.forEach(page => {
            page.items.filter(block => block.type === ElementType.LIST).forEach(listBlock => {
                var lastItemX;
                var currentLevel = 0;
                // x position -> nesting level assigned to items at that position
                const levelByX = new Map();
                var modifiedBlock = false;
                listBlock.textItems.forEach(textItem => {
                    // Compare against undefined so that a previous item at x === 0 still counts.
                    if (lastItemX === undefined) {
                        levelByX.set(textItem.x, 0);
                    } else if (textItem.x > lastItemX) {
                        // Further right than the previous item: one level deeper.
                        currentLevel++;
                        levelByX.set(textItem.x, currentLevel);
                    } else if (textItem.x < lastItemX) {
                        // Further left: go back to the level known for that position.
                        currentLevel = levelForOutdentedItem(levelByX, textItem.x);
                        levelByX.set(textItem.x, currentLevel);
                    }
                    if (currentLevel > 0) {
                        textItem.text = ' '.repeat(currentLevel * 3) + textItem.text;
                        modifiedBlock = true;
                    }
                    lastItemX = textItem.x;
                });
                listBlocks++;
                if (modifiedBlock) {
                    modifiedBlocks++;
                    listBlock.annotation = MODIFIED_ANNOTATION;
                } else {
                    listBlock.annotation = UNCHANGED_ANNOTATION;
                }
            });
        });
        return new ParseResult({
            ...parseResult,
            messages: ['Modified ' + modifiedBlocks + ' / ' + listBlocks + ' list blocks.']
        });
    }

}

// Resolves the level of an item that moved left. An x position that was never
// seen before gets the deepest level recorded at or to the left of it (0 if
// there is none), so the level never becomes undefined/NaN.
function levelForOutdentedItem(levelByX, x) {
    if (levelByX.has(x)) {
        return levelByX.get(x);
    }
    var level = 0;
    levelByX.forEach((knownLevel, knownX) => {
        if (knownX <= x && knownLevel > level) {
            level = knownLevel;
        }
    });
    return level;
}

View File

@ -1,142 +0,0 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect blocks containing plain or numbered list items and turn them into indented list blocks
// Block transformation that replaces untyped blocks containing at least two
// list-item lines with a LIST block whose items are indented by nesting level
// (three spaces per level, derived from the items' x positions).
export default class DetectLists extends ToTextItemBlockTransformation {
constructor() {
super("Detect Lists");
}
// Processes every page of parseResult, replacing page.items with a new block
// list, and returns a new ParseResult copying parseResult with one message.
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var foundBlocks = 0;
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
parseResult.pages.forEach(page => {
// minX is only used as a guard here: pages with a falsy minX are left untouched.
var minX = minXFromBlocks(page.items);
if (minX) {
const newBlocks = [];
page.items.forEach(block => {
// The original block always stays in the output (possibly annotated as removed).
newBlocks.push(block);
if (!block.type) {
const combineResult = textCombiner.combine(block.textItems);
if (hasMoreThan2LineItems(combineResult.textItems)) {
block.annotation = REMOVED_ANNOTATION;
foundBlocks++;
// State shared with pushLineItem below.
var lastItemX;
var currentLevel = 0;
// Despite the name, this maps an x position to the level assigned to it.
var xByLevel = {};
var itemsBeforeFirstLineItem = [];
var listBlockItems = [];
// Appends a copy of originalItem with the given text, indented by the current
// level. When setLevel is true and a previous item exists, the level is first
// adjusted by comparing x positions.
const pushLineItem = (originalItem, text, setLevel) => {
if (lastItemX && setLevel) {
if (originalItem.x > lastItemX) {
currentLevel++;
xByLevel[originalItem.x] = currentLevel;
} else if (originalItem.x < lastItemX) {
// NOTE(review): for an x never stored in xByLevel this lookup yields undefined,
// which makes the following level arithmetic produce NaN — confirm and guard.
currentLevel = xByLevel[originalItem.x];
}
} else {
xByLevel[originalItem.x] = 0;
}
listBlockItems.push(new TextItem({
...originalItem,
text: ' '.repeat(currentLevel * 3) + text
}));
lastItemX = originalItem.x;
};
combineResult.textItems.forEach(lineItem => {
if (isPlainListItem(lineItem.text)) {
// Replace the first character (the marker) with '- ' and trim the remainder.
var text = lineItem.text;
text = text.substring(1, text.length).trim();
text = '- ' + text;
pushLineItem(lineItem, text, true);
} else if (isNumberedListItem(lineItem.text)) {
var numberedText = lineItem.text;
// NOTE(review): the next line is an expression statement with no effect.
numberedText
pushLineItem(lineItem, numberedText, true);
} else {
// Non-item lines after the first list item are kept in the list at the current
// level; lines before it are collected separately.
if (lastItemX) {
pushLineItem(lineItem, lineItem.text, false);
} else {
itemsBeforeFirstLineItem.push(lineItem);
}
}
});
// Lines preceding the first list item become their own paragraph block.
if (itemsBeforeFirstLineItem.length > 0) {
newBlocks.push(new TextItemBlock({
textItems: itemsBeforeFirstLineItem,
type: ElementType.PARAGRAPH,
annotation: ADDED_ANNOTATION
}));
}
//TODO display with whitespace pre support
newBlocks.push(new TextItemBlock({
textItems: listBlockItems,
type: ElementType.LIST,
annotation: ADDED_ANNOTATION,
parsedElements: combineResult.parsedElements
}));
}
}
});
page.items = newBlocks;
}
});
return new ParseResult({
...parseResult,
messages: ['Detected ' + foundBlocks + ' list blocks.']
});
}
}
// Tells whether at least two of the given items start a list item (plain or
// numbered). Note that, despite the name, two such items are enough.
function hasMoreThan2LineItems(textItems:TextItem[]) {
    let stillNeeded = 2;
    for (const candidate of textItems) {
        const startsListItem = isPlainListItem(candidate.text) || isNumberedListItem(candidate.text);
        if (!startsListItem) {
            continue;
        }
        stillNeeded -= 1;
        if (stillNeeded === 0) {
            // Stop scanning as soon as the second list item is found.
            return true;
        }
    }
    return false;
}
// Tells whether the text starts with one of the plain list markers '-' or '•'.
function isPlainListItem(string) {
    const startsWithDash = string.startsWith('-');
    return startsWithDash || string.startsWith('•');
}
// Tells whether the first character of the text parses as a number.
// Only that first character is looked at; nothing after it is checked.
function isNumberedListItem(string) {
    const leadingCharacter = string.charAt(0);
    const parsedLeadingCharacter = parseInt(leadingCharacter);
    return !isNaN(parsedLeadingCharacter);
}

View File

@ -1,7 +1,7 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx'; import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx'; import TextItemBlock from '../TextItemBlock.jsx';
import { ADDED_ANNOTATION } from '../Annotation.jsx'; import { DETECTED_ANNOTATION } from '../Annotation.jsx';
import { minXFromTextItems } from '../../textItemFunctions.jsx'; import { minXFromTextItems } from '../../textItemFunctions.jsx';
// Gathers lines to blocks // Gathers lines to blocks
@ -21,7 +21,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
var stashedBlock = new TextItemBlock({}); var stashedBlock = new TextItemBlock({});
const flushStashedItems = () => { const flushStashedItems = () => {
if (stashedBlock.textItems.length > 1) { if (stashedBlock.textItems.length > 1) {
stashedBlock.annotation = ADDED_ANNOTATION; stashedBlock.annotation = DETECTED_ANNOTATION;
} }
blocks.push(stashedBlock); blocks.push(stashedBlock);
@ -54,19 +54,23 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) { if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
return false; return false;
} }
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
return false;
}
if (item.type !== stashedBlock.type) { if (item.type !== stashedBlock.type) {
return true; return true;
} }
if (item.type) { if (item.type) {
return !item.type.mergeToBlock; return !item.type.mergeToBlock;
} else { } else {
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1]; return hasBigDistance;
return shouldSplit(lastItem, item, minX, mostUsedDistance);
} }
} }
function shouldSplit(lastItem, item, minX, mostUsedDistance) { function bigDistance(lastItem, item, minX, mostUsedDistance) {
const distance = lastItem.y - item.y; const distance = lastItem.y - item.y;
if (distance < 0 - mostUsedDistance / 2) { if (distance < 0 - mostUsedDistance / 2) {
//distance is negative - and not only a bit //distance is negative - and not only a bit

View File

@ -1,27 +0,0 @@
import { expect } from 'chai';
import Headline from '../src/javascript/models/markdown/Headline';
// Spec for the Headline markdown model: level, surrounding newlines, and text prefix.
describe('Headline', () => {

    // Builds a headline of the given level and checks the properties every
    // level shares plus the level-specific transformed text.
    const expectHeadlineProps = (level, expectedText) => {
        const headline = new Headline({
            level: level
        });
        expect(headline.level).to.equal(level);
        expect(headline.newLineBefore).to.equal(true);
        expect(headline.newLineAfter).to.equal(true);
        expect(headline.transformText('Hello World')).to.equal(expectedText);
    };

    it('correct level 1 props', () => {
        expectHeadlineProps(1, '# Hello World');
    });

    it('correct level 2 props', () => {
        expectHeadlineProps(2, '## Hello World');
    });

});

View File

@ -1,6 +1,6 @@
import { expect } from 'chai'; import { expect } from 'chai';
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, charCodeArray } from '../src/javascript/functions.jsx' import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'
describe('hasUpperCaseCharacterInMiddleOfWord', () => { describe('hasUpperCaseCharacterInMiddleOfWord', () => {
@ -38,6 +38,23 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
}); });
}); });
// Spec for removeLeadingWhitespaces: leading whitespace is stripped, everything else is kept.
describe('removeLeadingWhitespaces', () => {
// Inputs without leading whitespace must come back unchanged, including trailing and inner spaces.
it('No Removes', () => {
expect(removeLeadingWhitespaces(".")).to.be.equal(".");
expect(removeLeadingWhitespaces(". ")).to.be.equal(". ");
expect(removeLeadingWhitespaces(". . ")).to.be.equal(". . ");
});
// Inputs with leading whitespace lose only that leading part.
it('Removes', () => {
// NOTE(review): the next two assertions are textually identical in this view —
// presumably they were meant to differ in the amount of leading whitespace; confirm.
expect(removeLeadingWhitespaces(" .")).to.be.equal(".");
expect(removeLeadingWhitespaces(" .")).to.be.equal(".");
expect(removeLeadingWhitespaces(" . ")).to.be.equal(". ");
expect(removeLeadingWhitespaces(" . . ")).to.be.equal(". . ");
});
});
describe('charCodeArray', () => { describe('charCodeArray', () => {
it('Charcodes', () => { it('Charcodes', () => {
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46); expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
@ -76,3 +93,45 @@ describe('normalizedCharCodeArray', () => {
}); });
}); });
// Spec for isListItem: '-' and '•' markers followed by whitespace are list items.
describe('isListItem', () => {
// Markers with optional leading whitespace and a following space match.
it('Match', () => {
expect(isListItem('- my text')).to.equal(true);
// NOTE(review): the next two assertions are textually identical in this view (as are
// the two ' • my text' ones below) — presumably the leading whitespace differed; confirm.
expect(isListItem(' - my text')).to.equal(true);
expect(isListItem(' - my text')).to.equal(true);
expect(isListItem('• my text')).to.equal(true);
expect(isListItem(' • my text')).to.equal(true);
expect(isListItem(' • my text')).to.equal(true);
});
// No marker, a marker without a following space, or a trailing marker must not match.
it('No Match', () => {
expect(isListItem('my text')).to.equal(false);
expect(isListItem('-my text')).to.equal(false);
expect(isListItem('•my text')).to.equal(false);
expect(isListItem(' -my text')).to.equal(false);
expect(isListItem('- my text -')).to.equal(false);
expect(isListItem('• my text •')).to.equal(false);
});
});
// Spec for isNumberedListItem: digits followed by a dot and whitespace are numbered items.
describe('isNumberedListItem', () => {
// One or more digits, a dot and a space match, with optional leading whitespace.
it('Match', () => {
expect(isNumberedListItem('1. my text')).to.equal(true);
expect(isNumberedListItem('2. my text')).to.equal(true);
// NOTE(review): several of the following assertions are textually identical in this
// view — presumably they differed in whitespace in the original file; confirm.
expect(isNumberedListItem('23. my text')).to.equal(true);
expect(isNumberedListItem('23. my text')).to.equal(true);
expect(isNumberedListItem(' 23. my text')).to.equal(true);
expect(isNumberedListItem(' 23. my text')).to.equal(true);
});
// A number without a dot, or a dot without a following space, must not match.
it('No Match', () => {
expect(isNumberedListItem('1two')).to.equal(false);
expect(isNumberedListItem('1 two')).to.equal(false);
expect(isNumberedListItem('1.two')).to.equal(false);
});
});