mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-27 04:51:37 +02:00
Base markdown converter on blocks
This commit is contained in:
parent
16e5a62951
commit
78f44a0ad9
@ -1,38 +1,42 @@
|
||||
import { groupByLine } from '../support/groupingUtils';
|
||||
import { groupByBlock, groupByLine } from '../support/groupingUtils';
|
||||
import Item from '../Item';
|
||||
import { Converter } from '../convert';
|
||||
import ChangeTracker from '../debug/ChangeTracker';
|
||||
import EvaluationTracker from '../debug/EvaluationTracker';
|
||||
import LineItemMerger from '../debug/LineItemMerger';
|
||||
import { TextType, headlineLevel } from '../text-types';
|
||||
|
||||
/**
 * Converter that renders a list of items as markdown text, block by block.
 */
export default class MarkdownConverter implements Converter {
  /**
   * Builds the markdown string for the given items.
   *
   * Items are grouped by block first and by line second. The text type of a
   * block is taken from the `types` entry of the block's first item; every
   * line of that block is rendered with that type.
   *
   * @param items the items to convert
   * @returns the concatenated markdown text
   */
  convert(items: Item[]) {
    let content = '';

    const lineMerger = new LineItemMerger();
    groupByBlock(items).forEach((blockItems) => {
      // Only the first item of the block is consulted for the block's types.
      const types = blockItems[0].data['types'] || [];
      groupByLine(blockItems).forEach((lineItems) => {
        // Fresh trackers are handed to every merge call; nothing reads them afterwards here.
        const evaluationTracker = new EvaluationTracker();
        const changeTracker = new ChangeTracker();
        const lineItem = lineMerger.merge(evaluationTracker, changeTracker, ['types'], lineItems);
        const itemText = lineItem.data['str'];
        // `types[0]` is undefined for an untyped block, which elementToText renders as plain text.
        content += elementToText(itemText, types[0]);
      });
      // An extra newline separates consecutive blocks.
      content += '\n';
    });

    return content;
  }
}
|
||||
|
||||
/**
 * Renders one line of text according to its text type.
 *
 * For the types 'H1' to 'H6' the text is prefixed with as many '#' characters
 * as `headlineLevel(type)` returns, followed by a space. Any other type
 * (including a missing one) yields the text unchanged. A trailing newline is
 * appended in both cases.
 *
 * @param text the line's text
 * @param type the text type to render the line as
 * @returns the rendered line, terminated by a newline
 */
function elementToText(text: string, type: TextType) {
  const isHeadline =
    type === 'H1' || type === 'H2' || type === 'H3' || type === 'H4' || type === 'H5' || type === 'H6';

  if (!isHeadline) {
    return text + '\n';
  }

  const hashes = '#'.repeat(headlineLevel(type));
  return hashes + ' ' + text + '\n';
}
|
||||
|
@ -72,6 +72,10 @@ export function groupByPage(items: Item[]): Item[][] {
|
||||
return groupBy(items, (item) => item.page);
|
||||
}
|
||||
|
||||
/**
 * Groups the given items by their 'block' element.
 *
 * Delegates to `groupByElement` with the element name 'block'.
 *
 * @param items the items to group
 * @returns the groups produced by `groupByElement`
 */
export function groupByBlock(items: Item[]): Item[][] {
  const itemsPerBlock = groupByElement(items, 'block');
  return itemsPerBlock;
}
|
||||
|
||||
/**
 * Groups the given items by their 'line' element.
 *
 * Delegates to `groupByElement` with the element name 'line'.
 *
 * @param items the items to group
 * @returns the groups produced by `groupByElement`
 */
export function groupByLine(items: Item[]): Item[][] {
  const itemsPerLine = groupByElement(items, 'line');
  return itemsPerLine;
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user