[WIP] Simplify list detection

This commit is contained in:
Johannes Zillmann 2017-03-11 13:42:09 +01:00
parent f8fecc4c1d
commit c6f592d3fc
11 changed files with 219 additions and 180 deletions

View File

@ -46,4 +46,19 @@ export function charCodeArray(string) {
charCodes.push(string.charCodeAt(i));
}
return charCodes;
}
}
/**
 * Returns `string` without its leading run of characters whose char code
 * equals WHITESPACE_CHAR_CODE. Everything from the first non-matching
 * character onwards is returned unchanged.
 */
export function removeLeadingWhitespaces(string) {
    // Walk forward to the first character that must be kept, then cut once.
    let firstKeptIndex = 0;
    while (string.charCodeAt(firstKeptIndex) === WHITESPACE_CHAR_CODE) {
        firstKeptIndex++;
    }
    return string.substring(firstKeptIndex, string.length);
}
/**
 * Tells whether `string` looks like a plain (unnumbered) list item:
 * optional leading whitespace, a '-' or '•' marker, one whitespace character
 * and further text whose last character is neither '-' nor '•'.
 */
export function isListItem(string) {
    const plainListItemPattern = /^[\s]*[-•][\s].*[^-•]$/g;
    return plainListItemPattern.test(string);
}
/**
 * Tells whether `string` looks like a numbered list item: optional leading
 * whitespace, one or more digits, a '.', one whitespace character and the
 * rest of the line.
 *
 * At least one digit is required, so a line that merely starts with ". "
 * is not reported as a numbered item.
 */
export function isNumberedListItem(string) {
    return /^\s*\d+\.\s.*$/.test(string);
}

View File

@ -22,3 +22,13 @@ export const UNCHANGED_ANNOTATION = new Annotation({
category: 'Unchanged',
color: 'brown'
})
// Annotation with category 'Detected' and color 'green'.
export const DETECTED_ANNOTATION = new Annotation({
category: 'Detected',
color: 'green'
});
// Annotation with category 'Modified'.
// NOTE(review): uses the same color ('green') as DETECTED_ANNOTATION — confirm this is intended.
export const MODIFIED_ANNOTATION = new Annotation({
category: 'Modified',
color: 'green'
});

View File

@ -5,9 +5,10 @@ import CompactLines from './transformations/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectListItems from './transformations/DetectListItems.jsx'
import GatherBlocks from './transformations/GatherBlocks.jsx'
import DetectLists from './transformations/DetectLists.jsx'
import DetectListLevels from './transformations/DetectListLevels.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx'
@ -34,9 +35,10 @@ export default class AppState {
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
new DetectTOC(),
new DetectListItems(),
new GatherBlocks(),
new DetectLists(),
new DetectListLevels(),
new DetectCodeBlocks(),
new DetectHeadlines(),

View File

@ -55,6 +55,8 @@ ElementType.initEnum({
}
},
LIST: {
mergeToBlock: true,
mergeFollowingNonTypedItemsWithSmallDistance: true,
toText(block:TextItemBlock) {
return concatTextItems(block.textItems);
}
@ -70,7 +72,6 @@ export function blockToText(block: TextItemBlock) {
if (!block.type) {
return concatTextItems(block.textItems);
}
console.debug(block.type);
return block.type.toText(block);
}

View File

@ -19,7 +19,7 @@ export class ParsedElements {
this.footnotes = options.footnotes;
}
add(parsedElements:ParsedElements) {
add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
}

View File

@ -0,0 +1,59 @@
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../functions.jsx';
// Detects list items: lines starting with '-', '•' or a number followed by '.'
export default class DetectListItems extends ToTextItemTransformation {
constructor() {
super("Detect List Items");
}
transform(parseResult:ParseResult) {
var foundListItems = 0;
var foundNumberedItems = 0;
parseResult.pages.forEach(page => {
const newTextItems = [];
page.items.forEach(textItem => {
newTextItems.push(textItem);
if (!textItem.type) {
var text = textItem.text;
if (isListItem(text)) {
foundListItems++
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
if (textWithDash === text) {
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.LIST;
} else {
textItem.annotation = REMOVED_ANNOTATION;
newTextItems.push(new TextItem({
...textItem,
text: textWithDash,
annotation: ADDED_ANNOTATION,
type: ElementType.LIST
}));
}
} else if (isNumberedListItem(text)) {
foundNumberedItems++;
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.LIST;
}
}
});
page.items = newTextItems;
});
return new ParseResult({
...parseResult,
messages: [
'Detected ' + foundListItems + ' plain list items.',
'Detected ' + foundNumberedItems + ' numbered list items.'
]
});
}
}

View File

@ -0,0 +1,58 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
// Cares for proper sub-item spacing/leveling
export default class DetectListLevels extends ToTextItemBlockTransformation {
constructor() {
super("Level Lists");
this.showWhitespaces = true;
}
transform(parseResult:ParseResult) {
var listBlocks = 0;
var modifiedBlocks = 0;
parseResult.pages.forEach(page => {
page.items.filter(block => block.type === ElementType.LIST).forEach(listBlock => {
var lastItemX;
var currentLevel = 0;
const xByLevel = {};
var modifiedBlock = false;
listBlock.textItems.forEach(textItem => {
const isListItem = true;
if (lastItemX && isListItem) {
if (textItem.x > lastItemX) {
currentLevel++;
xByLevel[textItem.x] = currentLevel;
} else if (textItem.x < lastItemX) {
currentLevel = xByLevel[textItem.x];
}
} else {
xByLevel[textItem.x] = 0;
}
if (currentLevel > 0) {
textItem.text = ' '.repeat(currentLevel * 3) + textItem.text;
modifiedBlock = true;
}
lastItemX = textItem.x;
});
listBlocks++;
if (modifiedBlock) {
modifiedBlocks++;
listBlock.annotation = MODIFIED_ANNOTATION;
} else {
listBlock.annotation = UNCHANGED_ANNOTATION;
}
});
});
return new ParseResult({
...parseResult,
messages: ['Modified ' + modifiedBlocks + ' / ' + listBlocks + ' list blocks.']
});
}
}

View File

@ -1,142 +0,0 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
export default class DetectLists extends ToTextItemBlockTransformation {
constructor() {
super("Detect Lists");
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var foundBlocks = 0;
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
parseResult.pages.forEach(page => {
var minX = minXFromBlocks(page.items);
if (minX) {
const newBlocks = [];
page.items.forEach(block => {
newBlocks.push(block);
if (!block.type) {
const combineResult = textCombiner.combine(block.textItems);
if (hasMoreThan2LineItems(combineResult.textItems)) {
block.annotation = REMOVED_ANNOTATION;
foundBlocks++;
var lastItemX;
var currentLevel = 0;
var xByLevel = {};
var itemsBeforeFirstLineItem = [];
var listBlockItems = [];
const pushLineItem = (originalItem, text, setLevel) => {
if (lastItemX && setLevel) {
if (originalItem.x > lastItemX) {
currentLevel++;
xByLevel[originalItem.x] = currentLevel;
} else if (originalItem.x < lastItemX) {
currentLevel = xByLevel[originalItem.x];
}
} else {
xByLevel[originalItem.x] = 0;
}
listBlockItems.push(new TextItem({
...originalItem,
text: ' '.repeat(currentLevel * 3) + text
}));
lastItemX = originalItem.x;
};
combineResult.textItems.forEach(lineItem => {
if (isPlainListItem(lineItem.text)) {
var text = lineItem.text;
text = text.substring(1, text.length).trim();
text = '- ' + text;
pushLineItem(lineItem, text, true);
} else if (isNumberedListItem(lineItem.text)) {
var numberedText = lineItem.text;
numberedText
pushLineItem(lineItem, numberedText, true);
} else {
if (lastItemX) {
pushLineItem(lineItem, lineItem.text, false);
} else {
itemsBeforeFirstLineItem.push(lineItem);
}
}
});
if (itemsBeforeFirstLineItem.length > 0) {
newBlocks.push(new TextItemBlock({
textItems: itemsBeforeFirstLineItem,
type: ElementType.PARAGRAPH,
annotation: ADDED_ANNOTATION
}));
}
//TODO display with whitespace pre support
newBlocks.push(new TextItemBlock({
textItems: listBlockItems,
type: ElementType.LIST,
annotation: ADDED_ANNOTATION,
parsedElements: combineResult.parsedElements
}));
}
}
});
page.items = newBlocks;
}
});
return new ParseResult({
...parseResult,
messages: ['Detected ' + foundBlocks + ' list blocks.']
});
}
}
function hasMoreThan2LineItems(textItems:TextItem[]) {
var numberOfListItemLineStarts = 0;
for ( let item of textItems ) {
if (isPlainListItem(item.text) || isNumberedListItem(item.text)) {
numberOfListItemLineStarts++;
if (numberOfListItemLineStarts == 2) {
return true;
}
}
}
return false;
}
// Tells whether the string starts with one of the plain list markers '-' or '•'.
function isPlainListItem(string) {
    const markers = ['-', '•'];
    return markers.some(marker => string.startsWith(marker));
}
// Tells whether the first character of the string parses as a number.
function isNumberedListItem(string) {
    const firstCharAsNumber = parseInt(string.charAt(0));
    return !isNaN(firstCharAsNumber);
}

View File

@ -1,7 +1,7 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import { ADDED_ANNOTATION } from '../Annotation.jsx';
import { DETECTED_ANNOTATION } from '../Annotation.jsx';
import { minXFromTextItems } from '../../textItemFunctions.jsx';
// Gathers lines to blocks
@ -21,7 +21,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
var stashedBlock = new TextItemBlock({});
const flushStashedItems = () => {
if (stashedBlock.textItems.length > 1) {
stashedBlock.annotation = ADDED_ANNOTATION;
stashedBlock.annotation = DETECTED_ANNOTATION;
}
blocks.push(stashedBlock);
@ -54,19 +54,23 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
return false;
}
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
return false;
}
if (item.type !== stashedBlock.type) {
return true;
}
if (item.type) {
return !item.type.mergeToBlock;
} else {
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
return shouldSplit(lastItem, item, minX, mostUsedDistance);
return hasBigDistance;
}
}
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
function bigDistance(lastItem, item, minX, mostUsedDistance) {
const distance = lastItem.y - item.y;
if (distance < 0 - mostUsedDistance / 2) {
//distance is negative - and not only a bit

View File

@ -1,27 +0,0 @@
import { expect } from 'chai';
import Headline from '../src/javascript/models/markdown/Headline';
// Specs for the markdown Headline model: for levels 1 and 2 they check the
// level, the newLineBefore/newLineAfter flags and that transformText prefixes
// the text with one '#' per level.
describe('Headline', () => {
it('correct level 1 props', () => {
const headline = new Headline({
level: 1
});
expect(headline.level).to.equal(1);
expect(headline.newLineBefore).to.equal(true);
expect(headline.newLineAfter).to.equal(true);
expect(headline.transformText('Hello World')).to.equal('# Hello World');
});
it('correct level 2 props', () => {
const headline = new Headline({
level: 2
});
expect(headline.level).to.equal(2);
expect(headline.newLineBefore).to.equal(true);
expect(headline.newLineAfter).to.equal(true);
expect(headline.transformText('Hello World')).to.equal('## Hello World');
});
});

View File

@ -1,6 +1,6 @@
import { expect } from 'chai';
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, charCodeArray } from '../src/javascript/functions.jsx'
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
@ -38,6 +38,23 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
});
});
// Specs for removeLeadingWhitespaces: strings without leading whitespace come
// back unchanged; leading whitespace is stripped while inner and trailing
// whitespace is kept.
describe('removeLeadingWhitespaces', () => {
it('No Removes', () => {
expect(removeLeadingWhitespaces(".")).to.be.equal(".");
expect(removeLeadingWhitespaces(". ")).to.be.equal(". ");
expect(removeLeadingWhitespaces(". . ")).to.be.equal(". . ");
});
it('Removes', () => {
// NOTE(review): the first two inputs read identically here — presumably they
// differ in the number of leading spaces; confirm against the original file.
expect(removeLeadingWhitespaces(" .")).to.be.equal(".");
expect(removeLeadingWhitespaces(" .")).to.be.equal(".");
expect(removeLeadingWhitespaces(" . ")).to.be.equal(". ");
expect(removeLeadingWhitespaces(" . . ")).to.be.equal(". . ");
});
});
describe('charCodeArray', () => {
it('Charcodes', () => {
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
@ -76,3 +93,45 @@ describe('normalizedCharCodeArray', () => {
});
});
// Specs for isListItem: a '-' or '•' marker (optionally preceded by
// whitespace) followed by a space matches; a missing space after the marker
// or a trailing marker character does not.
describe('isListItem', () => {
it('Match', () => {
// NOTE(review): some inputs below read identically here — presumably they
// differ in the amount of leading whitespace; confirm against the original file.
expect(isListItem('- my text')).to.equal(true);
expect(isListItem(' - my text')).to.equal(true);
expect(isListItem(' - my text')).to.equal(true);
expect(isListItem('• my text')).to.equal(true);
expect(isListItem(' • my text')).to.equal(true);
expect(isListItem(' • my text')).to.equal(true);
});
it('No Match', () => {
expect(isListItem('my text')).to.equal(false);
expect(isListItem('-my text')).to.equal(false);
expect(isListItem('•my text')).to.equal(false);
expect(isListItem(' -my text')).to.equal(false);
expect(isListItem('- my text -')).to.equal(false);
expect(isListItem('• my text •')).to.equal(false);
});
});
// Specs for isNumberedListItem: digits followed by '.' and a space match
// (optionally preceded by whitespace); a missing dot or a missing space after
// the dot does not.
describe('isNumberedListItem', () => {
it('Match', () => {
// NOTE(review): some inputs below read identically here — presumably they
// differ in the amount of whitespace; confirm against the original file.
expect(isNumberedListItem('1. my text')).to.equal(true);
expect(isNumberedListItem('2. my text')).to.equal(true);
expect(isNumberedListItem('23. my text')).to.equal(true);
expect(isNumberedListItem('23. my text')).to.equal(true);
expect(isNumberedListItem(' 23. my text')).to.equal(true);
expect(isNumberedListItem(' 23. my text')).to.equal(true);
});
it('No Match', () => {
expect(isNumberedListItem('1two')).to.equal(false);
expect(isNumberedListItem('1 two')).to.equal(false);
expect(isNumberedListItem('1.two')).to.equal(false);
});
});
});