diff --git a/examples/Achieving-The-Paris-Climate-Agreement/detectListLevels.json b/examples/Achieving-The-Paris-Climate-Agreement/detectListLevels.json new file mode 100644 index 0000000..7948010 --- /dev/null +++ b/examples/Achieving-The-Paris-Climate-Agreement/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 526, + "items": 52758, + "groupedItems": 31211, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Adventures-Of-Sherlock-Holmes/detectListLevels.json b/examples/Adventures-Of-Sherlock-Holmes/detectListLevels.json new file mode 100644 index 0000000..e952320 --- /dev/null +++ b/examples/Adventures-Of-Sherlock-Holmes/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 199, + "items": 8436, + "groupedItems": 8308, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Alice-In-Wonderland/detectListLevels.json b/examples/Alice-In-Wonderland/detectListLevels.json new file mode 100644 index 0000000..8418318 --- /dev/null +++ b/examples/Alice-In-Wonderland/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 75, + "items": 3043, + "groupedItems": 2561, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + 
"globals": {} +} \ No newline at end of file diff --git a/examples/CC-NC_Leitfaden/detectListLevels.json b/examples/CC-NC_Leitfaden/detectListLevels.json new file mode 100644 index 0000000..12ddfc8 --- /dev/null +++ b/examples/CC-NC_Leitfaden/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 24, + "items": 1264, + "groupedItems": 1164, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/CC_License_Agreement_of_siMPle/detectListLevels.json b/examples/CC_License_Agreement_of_siMPle/detectListLevels.json new file mode 100644 index 0000000..d8e2d09 --- /dev/null +++ b/examples/CC_License_Agreement_of_siMPle/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 7, + "items": 483, + "groupedItems": 217, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Closed-Syllables/detectListLevels.json b/examples/Closed-Syllables/detectListLevels.json new file mode 100644 index 0000000..4f73ac2 --- /dev/null +++ b/examples/Closed-Syllables/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 19, + "items": 1365, + "groupedItems": 1171, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of 
file diff --git a/examples/ExamplePdf/detectListLevels.json b/examples/ExamplePdf/detectListLevels.json new file mode 100644 index 0000000..ecb71f0 --- /dev/null +++ b/examples/ExamplePdf/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 6, + "items": 268, + "groupedItems": 145, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Flash-Masques-Temperature/detectListLevels.json b/examples/Flash-Masques-Temperature/detectListLevels.json new file mode 100644 index 0000000..187ea21 --- /dev/null +++ b/examples/Flash-Masques-Temperature/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 4, + "items": 134, + "groupedItems": 108, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Grammar-Matters/detectListLevels.json b/examples/Grammar-Matters/detectListLevels.json new file mode 100644 index 0000000..0531446 --- /dev/null +++ b/examples/Grammar-Matters/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 116, + "items": 7676, + "groupedItems": 3479, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Life-Of-God-In-Soul-Of-Man/detectListLevels.json 
b/examples/Life-Of-God-In-Soul-Of-Man/detectListLevels.json new file mode 100644 index 0000000..4988adf --- /dev/null +++ b/examples/Life-Of-God-In-Soul-Of-Man/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 137, + "items": 24829, + "groupedItems": 3105, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Made-with-cc/detectListLevels.json b/examples/Made-with-cc/detectListLevels.json new file mode 100644 index 0000000..1c55071 --- /dev/null +++ b/examples/Made-with-cc/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 153, + "items": 14949, + "groupedItems": 10600, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Safe-Communication/detectListLevels.json b/examples/Safe-Communication/detectListLevels.json new file mode 100644 index 0000000..86badbf --- /dev/null +++ b/examples/Safe-Communication/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 60, + "items": 3968, + "groupedItems": 1429, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/St-Mary-Witney-Social-Audit/detectListLevels.json 
b/examples/St-Mary-Witney-Social-Audit/detectListLevels.json new file mode 100644 index 0000000..5f20852 --- /dev/null +++ b/examples/St-Mary-Witney-Social-Audit/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 26, + "items": 1843, + "groupedItems": 1506, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/The-Art-of-Public-Speaking/detectListLevels.json b/examples/The-Art-of-Public-Speaking/detectListLevels.json new file mode 100644 index 0000000..ae15487 --- /dev/null +++ b/examples/The-Art-of-Public-Speaking/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 466, + "items": 772193, + "groupedItems": 15227, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/The-Impact-of-Open-Access-Latin-American-Scholarship/detectListLevels.json b/examples/The-Impact-of-Open-Access-Latin-American-Scholarship/detectListLevels.json new file mode 100644 index 0000000..6ecb4de --- /dev/null +++ b/examples/The-Impact-of-Open-Access-Latin-American-Scholarship/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 16, + "items": 1242, + "groupedItems": 416, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ 
No newline at end of file diff --git a/examples/The-Man-Without-A-Body/detectListLevels.json b/examples/The-Man-Without-A-Body/detectListLevels.json new file mode 100644 index 0000000..8f04ae6 --- /dev/null +++ b/examples/The-Man-Without-A-Body/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 4, + "items": 522, + "groupedItems": 378, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/The-War-of-the-Worlds/detectListLevels.json b/examples/The-War-of-the-Worlds/detectListLevels.json new file mode 100644 index 0000000..a3097c0 --- /dev/null +++ b/examples/The-War-of-the-Worlds/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 293, + "items": 9255, + "groupedItems": 6520, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Tragedy-Of-The-Commons/detectListLevels.json b/examples/Tragedy-Of-The-Commons/detectListLevels.json new file mode 100644 index 0000000..d9da861 --- /dev/null +++ b/examples/Tragedy-Of-The-Commons/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 7, + "items": 6779, + "groupedItems": 1096, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file 
diff --git a/examples/Watered-Soul-Blog-Book/detectListLevels.json b/examples/Watered-Soul-Blog-Book/detectListLevels.json new file mode 100644 index 0000000..7989a36 --- /dev/null +++ b/examples/Watered-Soul-Blog-Book/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 644, + "items": 27118, + "groupedItems": 21363, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/WoodUp/detectListLevels.json b/examples/WoodUp/detectListLevels.json new file mode 100644 index 0000000..1190b74 --- /dev/null +++ b/examples/WoodUp/detectListLevels.json @@ -0,0 +1,39 @@ +{ + "pages": 255, + "items": 20093, + "groupedItems": 7254, + "changes": 0, + "schema": [ + { + "name": "block" + }, + { + "name": "line" + }, + { + "name": "types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/src/convert/MarkdownConverter.ts b/src/convert/MarkdownConverter.ts index 08065e2..313bacb 100644 --- a/src/convert/MarkdownConverter.ts +++ b/src/convert/MarkdownConverter.ts @@ -11,11 +11,11 @@ export default class MarkdownConverter implements Converter { let content = ''; const lineMerger = new LineItemMerger(); + const evaluationTracker = new EvaluationTracker(); + const changeTracker = new ChangeTracker(); groupByBlock(items).forEach((blockItems) => { const types = blockItems[0].data['types'] || []; groupByLine(blockItems).forEach((lineItems) => { - const evaluationTracker = new EvaluationTracker(); - const changeTracker = new ChangeTracker(); const lineItem = lineMerger.merge(evaluationTracker, 
changeTracker, ['types'], lineItems); const itemText = lineItem.data['str']; content += elementToText(itemText, types[0]); diff --git a/src/index.ts b/src/index.ts index e4b4d77..c32d83c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -17,6 +17,7 @@ import NoOpTransformer from './transformer/NoOpTransformer'; import {type ParseConfig } from './parse'; import DetectListItems from './transformer/DetectListItems'; import DetectBlocks from './transformer/DetectBlocks'; +import DetectListLevels from './transformer/DetectListLevels'; export const transformers = [ new AdjustHeight(), @@ -30,6 +31,7 @@ export const transformers = [ new DetectHeaders(), new DetectListItems(), new DetectBlocks(), + new DetectListLevels(), new NoOpTransformer(), ]; diff --git a/src/support/stringFunctions.ts b/src/support/stringFunctions.ts index fcacae6..d70f6f1 100644 --- a/src/support/stringFunctions.ts +++ b/src/support/stringFunctions.ts @@ -44,7 +44,7 @@ export function extractEndingNumber(text: string): number | undefined { return undefined; } -export function isListItemCharacter(string) { +export function isListItemCharacter(string: string) { if (string.length > 1) { return false; }
diff --git a/src/transformer/DetectListLevels.ts b/src/transformer/DetectListLevels.ts
new file mode 100644
index 0000000..d03ca37
--- /dev/null
+++ b/src/transformer/DetectListLevels.ts
@@ -0,0 +1,106 @@
+import Item from '../Item';
+import ItemResult from '../ItemResult';
+import ItemTransformer from './ItemTransformer';
+import TransformContext from './TransformContext';
+import LineItemMerger from '../debug/LineItemMerger';
+import { groupByBlock, groupByLine } from '../support/groupingUtils';
+import { TextType, toBlockType } from '../text-types';
+import { isListItem, isNumberedListItem } from '../support/stringFunctions';
+
+/**
+ * Indents the lines of list blocks according to their nesting level.
+ *
+ * The level of a list item is derived from the 'x' of the first item on its line, compared with the 'x' of the
+ * list items seen before it in the same block. Lines which do not start a list item are treated as overflow
+ * (continuation) lines of the previous list item and keep its level.
+ */
+export default class DetectListLevels extends ItemTransformer {
+  constructor() {
+    super('Detect List Levels', 'Figure out the nesting levels of each list item', {
+      requireColumns: ['str', 'block', 'x'],
+      debug: {
+        // showAll: true,
+        itemMerger: new LineItemMerger(false),
+      },
+    });
+  }
+
+  /**
+   * Prefixes the first item of every nested list line with 3 spaces per level (plus 1 for overflow lines).
+   *
+   * The items are modified in place; the returned items are the input items in their original order.
+   *
+   * TODO instead of changing the 'str' we should annotate the item and let the converters do their thing
+   */
+  transform(context: TransformContext, inputItems: Item[]): ItemResult {
+    let listBlocks = 0;
+    let modifiedBlocks = 0;
+    groupByBlock(inputItems)
+      .filter((blockItems) => {
+        const types: TextType[] = blockItems[0].data['types'] || [];
+        return types.map(toBlockType).includes('LIST');
+      })
+      .forEach((blockItems) => {
+        listBlocks++;
+        if (this.indentListBlock(blockItems)) {
+          modifiedBlocks++;
+        }
+      });
+
+    return {
+      items: inputItems.map((item) => {
+        return item;
+      }),
+      messages: ['Modified ' + modifiedBlocks + ' / ' + listBlocks + ' list blocks.'],
+    };
+  }
+
+  /** Indents the lines of a single list block. Returns true if at least one line got indented. */
+  private indentListBlock(blockItems: Item[]): boolean {
+    // Level by the 'x' at which a list item of that level started
+    const levelByX = new Map<number, number>();
+    // Undefined until the first line got processed. Compared explicitly since an 'x' of 0 is a valid position.
+    let lastItemX: number | undefined;
+    let currentLevel = 0;
+    let modifiedBlock = false;
+    groupByLine(blockItems).forEach((lineItems) => {
+      const firstItem = lineItems[0];
+      const text = firstItem.data['str'];
+      const x: number = firstItem.data['x'];
+      let isOverflowLine = false;
+      if (lastItemX === undefined) {
+        levelByX.set(x, 0);
+      } else if (!(isListItem(text + ' ...') || isNumberedListItem(text + ' ...'))) {
+        // current level remains the same
+        isOverflowLine = true;
+      } else if (x > lastItemX) {
+        currentLevel++;
+        levelByX.set(x, currentLevel);
+      } else if (x < lastItemX) {
+        // An 'x' which never started a list item falls back to the closest enclosing level
+        currentLevel = levelByX.get(x) ?? this.enclosingLevel(levelByX, x);
+        levelByX.set(x, currentLevel);
+      }
+      if (currentLevel > 0) {
+        const indent = ' '.repeat(currentLevel * 3 + (isOverflowLine ? 1 : 0));
+        firstItem.data['str'] = indent + firstItem.data['str'];
+        modifiedBlock = true;
+      }
+      if (!isOverflowLine) {
+        lastItemX = x;
+      }
+    });
+    return modifiedBlock;
+  }
+
+  /** Returns the deepest known level which starts at or left of the given 'x', or 0 if there is none. */
+  private enclosingLevel(levelByX: Map<number, number>, x: number): number {
+    let level = 0;
+    levelByX.forEach((knownLevel, knownX) => {
+      if (knownX <= x && knownLevel > level) {
+        level = knownLevel;
+      }
+    });
+    return level;
+  }
+}