[WIP] Simplify list detection

2025-06-20 17:47:47 +02:00 · 2017-03-11 13:42:09 +01:00 · 2017-03-11 13:42:09 +01:00 · c6f592d3fc
commit c6f592d3fc
parent f8fecc4c1d
11 changed files with 219 additions and 180 deletions
--- a/src/javascript/functions.jsx
+++ b/src/javascript/functions.jsx
@ -46,4 +46,19 @@ export function charCodeArray(string) {
        charCodes.push(string.charCodeAt(i));
    }
    return charCodes;
-}
+}
+
+export function removeLeadingWhitespaces(string) {
+    let start = 0; // index of the first char that is not WHITESPACE_CHAR_CODE
+    // charCodeAt() past the end yields NaN, so the scan also stops on all-whitespace input
+    while (string.charCodeAt(start) === WHITESPACE_CHAR_CODE) start++;
+    return string.substring(start, string.length);
+}
+
+export function isListItem(string) { // optional indent, "-" or "•", one whitespace, then text not ending in "-" or "•"
+    return /^\s*[-•]\s.*[^-•]$/.test(string); // no `g` flag: a one-shot test needs no lastIndex state
+}
+
+export function isNumberedListItem(string) { // optional indent, one or more digits, ".", one whitespace, then any text
+    return /^\s*\d+\.\s.*$/.test(string); // `\d+` (not `\d*`): a bare ". text" is not a numbered item
+}
--- a/src/javascript/models/Annotation.jsx
+++ b/src/javascript/models/Annotation.jsx
@ -22,3 +22,13 @@ export const UNCHANGED_ANNOTATION = new Annotation({
    category: 'Unchanged',
    color: 'brown'
 })
+
+export const DETECTED_ANNOTATION = new Annotation({ // set on items/blocks a transformation classified without rewriting their text
+    category: 'Detected',
+    color: 'green'
+});
+
+export const MODIFIED_ANNOTATION = new Annotation({ // set by DetectListLevels on list blocks whose item text it indented
+    category: 'Modified',
+    color: 'green' // NOTE(review): same color as DETECTED_ANNOTATION — confirm this is intended
+});
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -5,9 +5,10 @@ import CompactLines from './transformations/CompactLines.jsx';
 import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
 import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
 import DetectTOC from './transformations/DetectTOC.jsx'
+import DetectListItems from './transformations/DetectListItems.jsx'

 import GatherBlocks from './transformations/GatherBlocks.jsx'
-import DetectLists from './transformations/DetectLists.jsx'
+import DetectListLevels from './transformations/DetectListLevels.jsx'
 import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
 import DetectHeadlines from './transformations/DetectHeadlines.jsx'
 // import DetectFormats from './transformations/DetectFormats.jsx'
@ -34,9 +35,10 @@ export default class AppState {
            new RemoveRepetitiveElements(),
            new VerticalToHorizontal(),
            new DetectTOC(),
+            new DetectListItems(),

            new GatherBlocks(),
-            new DetectLists(),
+            new DetectListLevels(),
            new DetectCodeBlocks(),
            new DetectHeadlines(),

--- a/src/javascript/models/ElementType.jsx
+++ b/src/javascript/models/ElementType.jsx
@ -55,6 +55,8 @@ ElementType.initEnum({
        }
    },
    LIST: {
+        mergeToBlock: true,
+        mergeFollowingNonTypedItemsWithSmallDistance: true,
        toText(block:TextItemBlock) {
            return concatTextItems(block.textItems);
        }
@ -70,7 +72,6 @@ export function blockToText(block: TextItemBlock) {
    if (!block.type) {
        return concatTextItems(block.textItems);
    }
-    console.debug(block.type);
    return block.type.toText(block);
 }

--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@ -19,7 +19,7 @@ export class ParsedElements {
        this.footnotes = options.footnotes;
    }

-    add(parsedElements:ParsedElements) {
+    add(parsedElements) {
        this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
        this.footnotes = this.footnotes.concat(parsedElements.footnotes);
    }
--- a/src/javascript/models/transformations/DetectListItems.jsx
+++ b/src/javascript/models/transformations/DetectListItems.jsx
@ -0,0 +1,59 @@
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
+import ParseResult from '../ParseResult.jsx';
+import TextItem from '../TextItem.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
+import ElementType from '../ElementType.jsx';
+import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../functions.jsx';
+
+// Tags untyped text items that look like bullet ("-", "•") or numbered ("1.") list entries as ElementType.LIST.
+export default class DetectListItems extends ToTextItemTransformation {
+
+    constructor() {
+        super("Detect List Items");
+    }
+    // Replaces page.items and annotates items in place; returns a new ParseResult carrying two summary messages.
+    transform(parseResult:ParseResult) {
+        var foundListItems = 0;
+        var foundNumberedItems = 0;
+        parseResult.pages.forEach(page => {
+            const newTextItems = [];
+            page.items.forEach(textItem => {
+                newTextItems.push(textItem); // the original item is always kept (and possibly annotated below)
+                if (!textItem.type) { // items that already carry a type are passed through untouched
+                    var text = textItem.text;
+                    if (isListItem(text)) {
+                        foundListItems++
+                        const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length); // drop the indent, swap the marker for "-"
+                        if (textWithDash === text) { // already in normalized form: tag the item itself
+                            textItem.annotation = DETECTED_ANNOTATION;
+                            textItem.type = ElementType.LIST;
+                        } else { // otherwise mark the original as removed and append a normalized copy right after it
+                            textItem.annotation = REMOVED_ANNOTATION;
+                            newTextItems.push(new TextItem({
+                                ...textItem,
+                                text: textWithDash,
+                                annotation: ADDED_ANNOTATION,
+                                type: ElementType.LIST
+                            }));
+                        }
+                    } else if (isNumberedListItem(text)) { // numbered items are tagged as-is; their text is not rewritten
+                        foundNumberedItems++;
+                        textItem.annotation = DETECTED_ANNOTATION;
+                        textItem.type = ElementType.LIST;
+                    }
+                }
+            });
+            page.items = newTextItems;
+        });
+
+        return new ParseResult({
+            ...parseResult,
+            messages: [
+                'Detected ' + foundListItems + ' plain list items.',
+                'Detected ' + foundNumberedItems + ' numbered list items.'
+            ]
+        });
+
+    }
+
+}
--- a/src/javascript/models/transformations/DetectListLevels.jsx
+++ b/src/javascript/models/transformations/DetectListLevels.jsx
@ -0,0 +1,58 @@
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
+import ParseResult from '../ParseResult.jsx';
+import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
+import ElementType from '../ElementType.jsx';
+
+// Indents the items of LIST blocks by nesting level (3 spaces per level), derived from their x positions.
+export default class DetectListLevels extends ToTextItemBlockTransformation {
+    constructor() {
+        super("Level Lists");
+        this.showWhitespaces = true;
+    }
+
+    // Rewrites item text and block annotations in place; returns a new ParseResult with a summary message.
+    transform(parseResult:ParseResult) {
+        var listBlocks = 0;
+        var modifiedBlocks = 0;
+        parseResult.pages.forEach(page => {
+            page.items.filter(block => block.type === ElementType.LIST).forEach(listBlock => {
+                var lastItemX;
+                var currentLevel = 0;
+                const xByLevel = {}; // x position -> level assigned at that x
+                var modifiedBlock = false;
+                // NOTE: every text item of the block is treated as a list item (no continuation-line check yet).
+                listBlock.textItems.forEach(textItem => {
+                    if (lastItemX !== undefined) { // explicit check: a first item at x === 0 must count as seen
+                        if (textItem.x > lastItemX) {
+                            currentLevel++;
+                            xByLevel[textItem.x] = currentLevel;
+                        } else if (textItem.x < lastItemX) {
+                            // Out-dent: snap to the nearest recorded x so an unseen x never yields undefined/NaN.
+                            const nearestX = Object.keys(xByLevel).map(Number).reduce((best, x) =>
+                                Math.abs(x - textItem.x) < Math.abs(best - textItem.x) ? x : best);
+                            currentLevel = xByLevel[nearestX];
+                        }
+                    } else {
+                        xByLevel[textItem.x] = 0;
+                    }
+                    if (currentLevel > 0) {
+                        textItem.text = ' '.repeat(currentLevel * 3) + textItem.text;
+                        modifiedBlock = true;
+                    }
+                    lastItemX = textItem.x;
+                });
+                listBlocks++;
+                if (modifiedBlock) {
+                    modifiedBlocks++;
+                    listBlock.annotation = MODIFIED_ANNOTATION;
+                } else {
+                    listBlock.annotation = UNCHANGED_ANNOTATION;
+                }
+            });
+        });
+        return new ParseResult({
+            ...parseResult,
+            messages: ['Modified ' + modifiedBlocks + ' / ' + listBlocks + ' list blocks.']
+        });
+    }
+}
--- a/src/javascript/models/transformations/DetectLists.jsx
+++ b/src/javascript/models/transformations/DetectLists.jsx
@ -1,142 +0,0 @@
-import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
-import ParseResult from '../ParseResult.jsx';
-import TextItem from '../TextItem.jsx';
-import TextItemBlock from '../TextItemBlock.jsx';
-import TextItemCombiner from '../TextItemCombiner.jsx';
-import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
-import ElementType from '../ElementType.jsx';
-import { minXFromBlocks } from '../../textItemFunctions.jsx';
-
-//Detect quotes, code etc.. which is transformed to markdown code syntax
-export default class DetectLists extends ToTextItemBlockTransformation {
-
-    constructor() {
-        super("Detect Lists");
-    }
-
-    transform(parseResult:ParseResult) {
-        const {mostUsedDistance} = parseResult.globals;
-        var foundBlocks = 0;
-        const textCombiner = new TextItemCombiner({
-            mostUsedDistance: mostUsedDistance
-        });
-
-        parseResult.pages.forEach(page => {
-            var minX = minXFromBlocks(page.items);
-            if (minX) {
-                const newBlocks = [];
-                page.items.forEach(block => {
-                    newBlocks.push(block);
-                    if (!block.type) {
-                        const combineResult = textCombiner.combine(block.textItems);
-                        if (hasMoreThan2LineItems(combineResult.textItems)) {
-                            block.annotation = REMOVED_ANNOTATION;
-                            foundBlocks++;
-
-                            var lastItemX;
-                            var currentLevel = 0;
-                            var xByLevel = {};
-                            var itemsBeforeFirstLineItem = [];
-                            var listBlockItems = [];
-
-                            const pushLineItem = (originalItem, text, setLevel) => {
-                                if (lastItemX && setLevel) {
-                                    if (originalItem.x > lastItemX) {
-                                        currentLevel++;
-                                        xByLevel[originalItem.x] = currentLevel;
-                                    } else if (originalItem.x < lastItemX) {
-                                        currentLevel = xByLevel[originalItem.x];
-                                    }
-                                } else {
-                                    xByLevel[originalItem.x] = 0;
-                                }
-
-
-                                listBlockItems.push(new TextItem({
-                                    ...originalItem,
-                                    text: ' '.repeat(currentLevel * 3) + text
-                                }));
-                                lastItemX = originalItem.x;
-
-                            };
-
-                            combineResult.textItems.forEach(lineItem => {
-                                if (isPlainListItem(lineItem.text)) {
-                                    var text = lineItem.text;
-                                    text = text.substring(1, text.length).trim();
-                                    text = '- ' + text;
-                                    pushLineItem(lineItem, text, true);
-
-                                } else if (isNumberedListItem(lineItem.text)) {
-                                    var numberedText = lineItem.text;
-                                    numberedText
-                                    pushLineItem(lineItem, numberedText, true);
-                                } else {
-                                    if (lastItemX) {
-                                        pushLineItem(lineItem, lineItem.text, false);
-                                    } else {
-                                        itemsBeforeFirstLineItem.push(lineItem);
-                                    }
-                                }
-                            });
-
-                            if (itemsBeforeFirstLineItem.length > 0) {
-                                newBlocks.push(new TextItemBlock({
-                                    textItems: itemsBeforeFirstLineItem,
-                                    type: ElementType.PARAGRAPH,
-                                    annotation: ADDED_ANNOTATION
-                                }));
-                            }
-                            //TODO display with whitespace pre support
-                            newBlocks.push(new TextItemBlock({
-                                textItems: listBlockItems,
-                                type: ElementType.LIST,
-                                annotation: ADDED_ANNOTATION,
-                                parsedElements: combineResult.parsedElements
-                            }));
-                        }
-                    }
-                });
-                page.items = newBlocks;
-            }
-        });
-
-        return new ParseResult({
-            ...parseResult,
-            messages: ['Detected ' + foundBlocks + ' list blocks.']
-        });
-
-    }
-
-}
-
-function hasMoreThan2LineItems(textItems:TextItem[]) {
-    var numberOfListItemLineStarts = 0;
-    for ( let item of textItems ) {
-        if (isPlainListItem(item.text) || isNumberedListItem(item.text)) {
-            numberOfListItemLineStarts++;
-            if (numberOfListItemLineStarts == 2) {
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
-function isPlainListItem(string) {
-    if (string.startsWith('-')) {
-        return true;
-    }
-    if (string.startsWith('•')) {
-        return true;
-    }
-    return false;
-}
-
-function isNumberedListItem(string) {
-    if (!isNaN(parseInt(string.charAt(0)))) {
-        return true;
-    }
-    return false;
-}
-
--- a/src/javascript/models/transformations/GatherBlocks.jsx
+++ b/src/javascript/models/transformations/GatherBlocks.jsx
@ -1,7 +1,7 @@
 import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
 import ParseResult from '../ParseResult.jsx';
 import TextItemBlock from '../TextItemBlock.jsx';
-import { ADDED_ANNOTATION } from '../Annotation.jsx';
+import { DETECTED_ANNOTATION } from '../Annotation.jsx';
 import { minXFromTextItems } from '../../textItemFunctions.jsx';

 // Gathers lines to blocks
@ -21,7 +21,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
            var stashedBlock = new TextItemBlock({});
            const flushStashedItems = () => {
                if (stashedBlock.textItems.length > 1) {
-                    stashedBlock.annotation = ADDED_ANNOTATION;
+                    stashedBlock.annotation = DETECTED_ANNOTATION;
                }

                blocks.push(stashedBlock);
@ -54,19 +54,23 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
    if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
        return false;
    }
+    const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
+    const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
+    if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
+        return false;
+    }
    if (item.type !== stashedBlock.type) {
        return true;
    }
    if (item.type) {
        return !item.type.mergeToBlock;
    } else {
-        const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
-        return shouldSplit(lastItem, item, minX, mostUsedDistance);
+        return hasBigDistance;
    }
 }


-function shouldSplit(lastItem, item, minX, mostUsedDistance) {
+function bigDistance(lastItem, item, minX, mostUsedDistance) {
    const distance = lastItem.y - item.y;
    if (distance < 0 - mostUsedDistance / 2) {
        //distance is negative - and not only a bit
--- a/test/Headline.spec.js
+++ b/test/Headline.spec.js
@ -1,27 +0,0 @@
-import { expect } from 'chai';
-
-import Headline from '../src/javascript/models/markdown/Headline';
-
-describe('Headline', () => {
-
-    it('correct level 1 props', () => {
-        const headline = new Headline({
-            level: 1
-        });
-        expect(headline.level).to.equal(1);
-        expect(headline.newLineBefore).to.equal(true);
-        expect(headline.newLineAfter).to.equal(true);
-        expect(headline.transformText('Hello World')).to.equal('# Hello World');
-    });
-
-    it('correct level 2 props', () => {
-        const headline = new Headline({
-            level: 2
-        });
-        expect(headline.level).to.equal(2);
-        expect(headline.newLineBefore).to.equal(true);
-        expect(headline.newLineAfter).to.equal(true);
-        expect(headline.transformText('Hello World')).to.equal('## Hello World');
-    });
-
-});
--- a/test/functions.spec.js
+++ b/test/functions.spec.js
@ -1,6 +1,6 @@
 import { expect } from 'chai';

-import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, charCodeArray } from '../src/javascript/functions.jsx'
+import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'

 describe('hasUpperCaseCharacterInMiddleOfWord', () => {

@ -38,6 +38,23 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
    });
 });

+describe('removeLeadingWhitespaces', () => {
+    // strings without a leading space must come back untouched
+    it('No Removes', () => {
+        [".", ". ", ". . "].forEach(input => {
+            expect(removeLeadingWhitespaces(input)).to.be.equal(input);
+        });
+    });
+    // only the leading spaces go; inner and trailing ones stay
+    it('Removes', () => {
+        const cases = [[" .", "."], ["  .", "."], ["  . ", ". "], ["  . . ", ". . "]];
+        cases.forEach(([input, expected]) => {
+            expect(removeLeadingWhitespaces(input)).to.be.equal(expected);
+        });
+    });
+});
+
+
 describe('charCodeArray', () => {
    it('Charcodes', () => {
        expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
@ -76,3 +93,45 @@ describe('normalizedCharCodeArray', () => {
    });

 });
+
+describe('isListItem', () => {
+
+    // each marker is accepted with zero, one or two leading spaces
+    it('Match', () => {
+        ['-', '•'].forEach(marker => {
+            ['', ' ', '  '].forEach(indent => {
+                expect(isListItem(indent + marker + ' my text')).to.equal(true);
+            });
+        });
+    });
+
+    // rejected: no marker, no space after the marker, or a trailing marker
+    it('No Match', () => {
+        const rejected = [
+            'my text', '-my text', '•my text',
+            ' -my text', '- my text -', '• my text •'
+        ];
+        rejected.forEach(text => {
+            expect(isListItem(text)).to.equal(false);
+        });
+    });
+});
+
+describe('isNumberedListItem', () => {
+
+    // digits, a dot and at least one whitespace, optionally indented
+    it('Match', () => {
+        const accepted = ['1. my text', '2. my text', '23. my text', '23.   my text', ' 23.   my text', '  23.   my text'];
+        accepted.forEach(text => {
+            expect(isNumberedListItem(text)).to.equal(true);
+        });
+    });
+
+    // rejected: missing dot, or no whitespace after the dot
+    it('No Match', () => {
+        ['1two', '1 two', '1.two'].forEach(text => {
+            expect(isNumberedListItem(text)).to.equal(false);
+        });
+    });
+
+});