mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-27 13:01:26 +02:00
Base markdown converter on blocks
This commit is contained in:
parent
16e5a62951
commit
78f44a0ad9
@ -1,38 +1,42 @@
|
|||||||
import { groupByLine } from '../support/groupingUtils';
|
import { groupByBlock, groupByLine } from '../support/groupingUtils';
|
||||||
import Item from '../Item';
|
import Item from '../Item';
|
||||||
import { Converter } from '../convert';
|
import { Converter } from '../convert';
|
||||||
import ChangeTracker from '../debug/ChangeTracker';
|
import ChangeTracker from '../debug/ChangeTracker';
|
||||||
import EvaluationTracker from '../debug/EvaluationTracker';
|
import EvaluationTracker from '../debug/EvaluationTracker';
|
||||||
import LineItemMerger from '../debug/LineItemMerger';
|
import LineItemMerger from '../debug/LineItemMerger';
|
||||||
|
import { TextType, headlineLevel } from '../text-types';
|
||||||
|
|
||||||
export default class MarkdownConverter implements Converter {
|
export default class MarkdownConverter implements Converter {
|
||||||
convert(items: Item[]) {
|
convert(items: Item[]) {
|
||||||
let content = '';
|
let content = '';
|
||||||
|
|
||||||
const itemsByLine = groupByLine(items);
|
|
||||||
const lineMerger = new LineItemMerger();
|
const lineMerger = new LineItemMerger();
|
||||||
itemsByLine.forEach((lineItems) => {
|
groupByBlock(items).forEach((blockItems) => {
|
||||||
const lineItem = lineMerger.merge(new EvaluationTracker(), new ChangeTracker(), ['types'], lineItems);
|
const types = blockItems[0].data['types'] || [];
|
||||||
const types = lineItem.data['types'] || [];
|
groupByLine(blockItems).forEach((lineItems) => {
|
||||||
const itemText = lineItem.data['str'];
|
const evaluationTracker = new EvaluationTracker();
|
||||||
if (types.includes('H1')) {
|
const changeTracker = new ChangeTracker();
|
||||||
content += '# ' + itemText + '\n';
|
const lineItem = lineMerger.merge(evaluationTracker, changeTracker, ['types'], lineItems);
|
||||||
} else if (types.includes('H2')) {
|
const itemText = lineItem.data['str'];
|
||||||
content += '## ' + itemText + '\n';
|
content += elementToText(itemText, types[0]);
|
||||||
} else if (types.includes('H3')) {
|
});
|
||||||
content += '### ' + itemText + '\n';
|
|
||||||
} else if (types.includes('H4')) {
|
|
||||||
content += '#### ' + itemText + '\n';
|
|
||||||
} else if (types.includes('H5')) {
|
|
||||||
content += '##### ' + itemText + '\n';
|
|
||||||
} else if (types.includes('H6')) {
|
|
||||||
content += '###### ' + itemText + '\n';
|
|
||||||
} else {
|
|
||||||
content += itemText;
|
|
||||||
}
|
|
||||||
content += '\n';
|
content += '\n';
|
||||||
});
|
});
|
||||||
|
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Renders a single line of text as markdown, based on its text type.
 *
 * For the headline types 'H1' through 'H6' the text is prefixed with
 * `headlineLevel(type)` '#' characters and a space. Any other type
 * (including a missing one) yields the text unchanged. The returned
 * string always ends with a newline.
 *
 * @param text - The text of the line.
 * @param type - The text type that decides the formatting.
 * @returns The markdown for the line, terminated by '\n'.
 */
function elementToText(text: string, type: TextType) {
  if (
    type === 'H1' ||
    type === 'H2' ||
    type === 'H3' ||
    type === 'H4' ||
    type === 'H5' ||
    type === 'H6'
  ) {
    // One '#' per headline level, then a space before the headline text.
    const hashes = '#'.repeat(headlineLevel(type));
    return `${hashes} ${text}\n`;
  }

  // Non-headline text passes through unchanged.
  return `${text}\n`;
}
|
||||||
|
@ -72,6 +72,10 @@ export function groupByPage(items: Item[]): Item[][] {
|
|||||||
return groupBy(items, (item) => item.page);
|
return groupBy(items, (item) => item.page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Partitions `items` into groups by delegating to `groupByElement` with the
 * `'block'` key.
 *
 * @param items - The items to partition.
 * @returns The groups produced by `groupByElement` for the `'block'` key.
 */
export function groupByBlock(items: Item[]): Item[][] {
  const groupingKey = 'block';
  return groupByElement(items, groupingKey);
}
|
||||||
|
|
||||||
export function groupByLine(items: Item[]): Item[][] {
|
export function groupByLine(items: Item[]): Item[][] {
|
||||||
return groupByElement(items, 'line');
|
return groupByElement(items, 'line');
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user