List Levels

- No tests for this yet: the test infrastructure needs to be revised, and so does the transformation, which currently modifies the item contents directly.
This commit is contained in:
Johannes Zillmann 2024-04-05 12:06:21 -06:00
parent 78f44a0ad9
commit fab5d4649c
24 changed files with 866 additions and 3 deletions

View File

@ -0,0 +1,39 @@
{
"pages": 526,
"items": 52758,
"groupedItems": 31211,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 199,
"items": 8436,
"groupedItems": 8308,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 75,
"items": 3043,
"groupedItems": 2561,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 24,
"items": 1264,
"groupedItems": 1164,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 7,
"items": 483,
"groupedItems": 217,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 19,
"items": 1365,
"groupedItems": 1171,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 6,
"items": 268,
"groupedItems": 145,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 4,
"items": 134,
"groupedItems": 108,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 116,
"items": 7676,
"groupedItems": 3479,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 137,
"items": 24829,
"groupedItems": 3105,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 153,
"items": 14949,
"groupedItems": 10600,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 60,
"items": 3968,
"groupedItems": 1429,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 26,
"items": 1843,
"groupedItems": 1506,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 466,
"items": 772193,
"groupedItems": 15227,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 16,
"items": 1242,
"groupedItems": 416,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 4,
"items": 522,
"groupedItems": 378,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 293,
"items": 9255,
"groupedItems": 6520,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 7,
"items": 6779,
"groupedItems": 1096,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 644,
"items": 27118,
"groupedItems": 21363,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,39 @@
{
"pages": 255,
"items": 20093,
"groupedItems": 7254,
"changes": 0,
"schema": [
{
"name": "block"
},
{
"name": "line"
},
{
"name": "types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -11,11 +11,11 @@ export default class MarkdownConverter implements Converter {
let content = ''; let content = '';
const lineMerger = new LineItemMerger(); const lineMerger = new LineItemMerger();
const evaluationTracker = new EvaluationTracker();
const changeTracker = new ChangeTracker();
groupByBlock(items).forEach((blockItems) => { groupByBlock(items).forEach((blockItems) => {
const types = blockItems[0].data['types'] || []; const types = blockItems[0].data['types'] || [];
groupByLine(blockItems).forEach((lineItems) => { groupByLine(blockItems).forEach((lineItems) => {
const evaluationTracker = new EvaluationTracker();
const changeTracker = new ChangeTracker();
const lineItem = lineMerger.merge(evaluationTracker, changeTracker, ['types'], lineItems); const lineItem = lineMerger.merge(evaluationTracker, changeTracker, ['types'], lineItems);
const itemText = lineItem.data['str']; const itemText = lineItem.data['str'];
content += elementToText(itemText, types[0]); content += elementToText(itemText, types[0]);

View File

@ -17,6 +17,7 @@ import NoOpTransformer from './transformer/NoOpTransformer';
import {type ParseConfig } from './parse'; import {type ParseConfig } from './parse';
import DetectListItems from './transformer/DetectListItems'; import DetectListItems from './transformer/DetectListItems';
import DetectBlocks from './transformer/DetectBlocks'; import DetectBlocks from './transformer/DetectBlocks';
import DetectListLevels from './transformer/DetectListLevels';
export const transformers = [ export const transformers = [
new AdjustHeight(), new AdjustHeight(),
@ -30,6 +31,7 @@ export const transformers = [
new DetectHeaders(), new DetectHeaders(),
new DetectListItems(), new DetectListItems(),
new DetectBlocks(), new DetectBlocks(),
new DetectListLevels(),
new NoOpTransformer(), new NoOpTransformer(),
]; ];

View File

@ -44,7 +44,7 @@ export function extractEndingNumber(text: string): number | undefined {
return undefined; return undefined;
} }
export function isListItemCharacter(string) { export function isListItemCharacter(string: string) {
if (string.length > 1) { if (string.length > 1) {
return false; return false;
} }

View File

@ -0,0 +1,81 @@
import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger';
import { groupByBlock, groupByLine } from '../support/groupingUtils';
import { TextType, toBlockType } from '../text-types';
import { isListItem, isNumberedListItem } from '../support/stringFunctions';
/**
 * Transformer that works out the nesting level of the lines inside list blocks
 * and encodes that level as leading whitespace in the first item's `str`.
 *
 * The level is derived purely from the `x` value of the first item of each line,
 * compared against the `x` of the previous list-item line in the same block:
 * a larger `x` opens a deeper level, a smaller `x` returns to the level that was
 * recorded for that `x` earlier in the block.
 *
 * Note that the items handed to `transform` are mutated in place.
 */
export default class DetectListLevels extends ItemTransformer {
  constructor() {
    super('Detect List Levels', 'Figure out the nesting levels of each list item', {
      requireColumns: ['str', 'block', 'x'],
      debug: {
        // showAll: true,
        itemMerger: new LineItemMerger(false),
      },
    });
  }

  /**
   * Looks up the nesting level for an outdented line starting at `x`.
   *
   * Returns the level recorded for exactly `x` when there is one. Otherwise falls
   * back to the level of the closest recorded position to the left of `x`, or `0`
   * when nothing is recorded to the left. The fallback guarantees a numeric level
   * even for positions that were never seen before in the block.
   */
  private static levelForX(levelByX: Map<number, number>, x: number): number {
    const exactLevel = levelByX.get(x);
    if (exactLevel !== undefined) {
      return exactLevel;
    }
    let closestX = -Infinity;
    let closestLevel = 0;
    levelByX.forEach((knownLevel, knownX) => {
      if (knownX <= x && knownX > closestX) {
        closestX = knownX;
        closestLevel = knownLevel;
      }
    });
    return closestLevel;
  }

  // TODO instead of changing the 'str' we should annotate the item and let the converters do their thing
  /**
   * Indents the first item of every nested line in each block typed as `LIST`.
   *
   * @param context - transform context (not read by this transformer)
   * @param inputItems - items to inspect; the `str` of affected items is modified in place
   * @returns a copy of the input array holding the same item instances, plus a
   *          summary message of how many list blocks were modified
   */
  transform(context: TransformContext, inputItems: Item[]): ItemResult {
    let listBlocks = 0;
    let modifiedBlocks = 0;

    groupByBlock(inputItems)
      .filter((blockItems) => {
        const types: TextType[] = blockItems[0].data['types'] || [];
        return types.map(toBlockType).includes('LIST');
      })
      .forEach((blockItems) => {
        // x of the most recent list-item line; undefined until the first line was processed.
        let lastItemX: number | undefined;
        let currentLevel = 0;
        // Nesting level recorded for each x position at which a level was established.
        const levelByX = new Map<number, number>();
        let modifiedBlock = false;

        groupByLine(blockItems).forEach((lineItems) => {
          const firstItem = lineItems[0];
          const text = firstItem.data['str'];
          const x: number = firstItem.data['x'];
          // The suffix gives the detection helpers some trailing content after the marker.
          const startsListItem = isListItem(text + ' ...') || isNumberedListItem(text + ' ...');
          let isOverflowLine = false;

          // Compare against undefined explicitly: an x of 0 is a valid position.
          if (lastItemX === undefined) {
            levelByX.set(x, 0);
          } else if (startsListItem) {
            if (x > lastItemX) {
              currentLevel++;
              levelByX.set(x, currentLevel);
            } else if (x < lastItemX) {
              currentLevel = DetectListLevels.levelForX(levelByX, x);
            }
          } else {
            // Continuation of the previous list item: the current level remains the same.
            isOverflowLine = true;
          }

          if (currentLevel > 0) {
            const overflowPadding = isOverflowLine ? ' ' : '';
            firstItem.data['str'] = overflowPadding + ' '.repeat(currentLevel * 3) + firstItem.data['str'];
            modifiedBlock = true;
          }

          // Overflow lines must not become the reference position for the next list item.
          if (!isOverflowLine) {
            lastItemX = x;
          }
        });

        listBlocks++;
        if (modifiedBlock) {
          modifiedBlocks++;
        }
      });

    return {
      items: [...inputItems],
      messages: ['Modified ' + modifiedBlocks + ' / ' + listBlocks + ' list blocks.'],
    };
  }
}