mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-27 23:30:48 +01:00
[WIP] improve TOC parsing
This commit is contained in:
parent
1fcd08f6d5
commit
c9352d8396
@ -3,26 +3,58 @@ import TextItemCombiner from './TextItemCombiner.jsx';
|
||||
import TextItem from './TextItem.jsx';
|
||||
|
||||
// Block type identifiers. Code in this file compares them against `block.type`
// (see the switch in blockToText) and assigns them when creating blocks.

// Headline types, one per headline level 1-6 (see headlineByLevel).
export const HEADLINE1 = "Headline 1";
export const HEADLINE2 = "Headline 2";
export const HEADLINE3 = "Headline 3";
export const HEADLINE4 = "Headline 4";
export const HEADLINE5 = "Headline 5";
export const HEADLINE6 = "Headline 6";

// Non-headline block types.
export const PARAGRAPH = "Paragraph";
export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC";
export const FOOTNOTE_BLOCK = "Footnotes";
|
||||
|
||||
/**
 * Maps a headline level to the matching headline block type constant.
 *
 * The comparison uses loose equality (==), so a value that coerces to one of
 * the numbers 1-6 (for example the string "3") is accepted as well.
 *
 * @param level headline level, 1 through 6
 * @returns HEADLINE1 ... HEADLINE6 for levels 1 ... 6
 * @throws a string message (not an Error object) for any other level
 */
export function headlineByLevel(level) {
    if (level == 1) return HEADLINE1;
    if (level == 2) return HEADLINE2;
    if (level == 3) return HEADLINE3;
    if (level == 4) return HEADLINE4;
    if (level == 5) return HEADLINE5;
    if (level == 6) return HEADLINE6;

    // No guard matched: report the offending value to the caller.
    throw "Unsupported headline level: " + level;
}
|
||||
|
||||
export function blockToText(block: PdfBlock) {
|
||||
switch (block.type) {
|
||||
case CODE_BLOCK:
|
||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
||||
case TOC_BLOCK:
|
||||
//TODO 2nd level
|
||||
//TODO real links
|
||||
var text = '';
|
||||
//TODO real links
|
||||
//TODO de-duplicate with DetectLists ?
|
||||
block.textItems.forEach(item => {
|
||||
text += '- ' + item.text + '\n';
|
||||
text += item.text + '\n';
|
||||
});
|
||||
return text;
|
||||
case HEADLINE1:
|
||||
return '#' + concatTextItems(block.textItems);
|
||||
return '# ' + concatTextItems(block.textItems);
|
||||
case HEADLINE2:
|
||||
return '## ' + concatTextItems(block.textItems);
|
||||
case HEADLINE3:
|
||||
return '### ' + concatTextItems(block.textItems);
|
||||
case HEADLINE4:
|
||||
return '#### ' + concatTextItems(block.textItems);
|
||||
case HEADLINE5:
|
||||
return '##### ' + concatTextItems(block.textItems);
|
||||
case HEADLINE6:
|
||||
return '###### ' + concatTextItems(block.textItems);
|
||||
default:
|
||||
var textItems = block.textItems;
|
||||
if (!block.type) {
|
||||
|
@ -16,21 +16,24 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var foundTocPages = 0;
|
||||
const tocPages = [];
|
||||
const maxPagesToEvaluate = Math.min(20, parseResult.content.length);
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
|
||||
var lastLevel = 0;
|
||||
const itemLeveler = new ItemLeveler();
|
||||
parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
|
||||
var linesCount = 0;
|
||||
var linesWithDigitsCount = 0;
|
||||
var lineItemsWithDigits = [];
|
||||
const unknownBlocks = new Set();
|
||||
var headlineBlock;
|
||||
page.blocks.forEach(block => {
|
||||
var blockHasLinesWithDigits = false;
|
||||
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
|
||||
var lastLineTextWithoutNumber;
|
||||
itemsGroupedByY.forEach(lineItem => {
|
||||
linesCount++
|
||||
var lineText = lineItem.text.replace(/\./g, '').trim();
|
||||
@ -41,26 +44,37 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
}
|
||||
lineText = lineText.trim();
|
||||
if (endsWithDigit) {
|
||||
if (lastLineTextWithoutNumber) { // 2-line item ?
|
||||
lineText = lastLineTextWithoutNumber + ' ' + lineText;
|
||||
lastLineTextWithoutNumber = null;
|
||||
}
|
||||
linesWithDigitsCount++;
|
||||
blockHasLinesWithDigits = true;
|
||||
lineItemsWithDigits.push(new TextItem({
|
||||
...lineItem,
|
||||
text: lineText
|
||||
}));
|
||||
} else {
|
||||
lastLineTextWithoutNumber = lineText;
|
||||
}
|
||||
});
|
||||
if (!headlineBlock && !blockHasLinesWithDigits) {
|
||||
headlineBlock = block;
|
||||
if (!blockHasLinesWithDigits) {
|
||||
if (!headlineBlock) {
|
||||
headlineBlock = block;
|
||||
} else {
|
||||
unknownBlocks.add(block);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if (linesWithDigitsCount * 100 / linesCount > 75) {
|
||||
foundTocPages++;
|
||||
tocPages.push(page.index + 1);
|
||||
const newBlocks = [];
|
||||
page.blocks.forEach((block) => {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
if (!unknownBlocks.has(block)) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
newBlocks.push(block);
|
||||
|
||||
if (block === headlineBlock) {
|
||||
newBlocks.push(new PdfBlock({
|
||||
textItems: textCombiner.combine(block.textItems).textItems,
|
||||
@ -69,6 +83,8 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
}));
|
||||
}
|
||||
});
|
||||
// lastLevel = processLevels(lineItemsWithDigits, lastLevel);
|
||||
itemLeveler.level(lineItemsWithDigits);
|
||||
newBlocks.push(new PdfBlock({
|
||||
textItems: lineItemsWithDigits,
|
||||
type: TOC_BLOCK,
|
||||
@ -80,8 +96,104 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: ['Detected ' + foundTocPages + ' table of content pages']
|
||||
globals: {
|
||||
...parseResult.globals,
|
||||
tocPages: tocPages
|
||||
|
||||
},
|
||||
messages: ['Detected ' + tocPages.length + ' table of content pages']
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
 * Assigns a nesting level to table-of-contents line items and rewrites each
 * item's text as an indented markdown list entry ("- text", three spaces of
 * indentation per level). Every processed item is also recorded as a Headline
 * in `headlines`.
 *
 * The leveling strategy is chosen once, from the items of the first level()
 * call, and reused for all later calls:
 *  - more than one distinct x position -> level by x position,
 *  - otherwise more than one distinct font -> level by font,
 *  - otherwise every item gets level 0.
 */
class ItemLeveler {

    constructor() {
        // Strategy method selected by the first level() call; null until then.
        this.levelByMethod = null;
        // Fonts in order of first appearance; a font's index is its level.
        // Only used by the font strategy.
        this.uniqueFonts = [];
        // One Headline per processed item, in processing order.
        this.headlines = [];
    }

    /**
     * Levels the given items in place, choosing the strategy on first use.
     *
     * @param lineItemsWithDigits items with `x`, `font` and `text` properties;
     *        each item's `text` is overwritten with the list-entry form
     */
    level(lineItemsWithDigits) {
        if (!this.levelByMethod) {
            const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
            if (uniqueX.length > 1) {
                this.levelByMethod = this.levelByXDiff;
            } else {
                const uniqueFonts = this.calculateUniqueFonts(lineItemsWithDigits);
                if (uniqueFonts.length > 1) {
                    this.uniqueFonts = uniqueFonts;
                    this.levelByMethod = this.levelByFont;
                } else {
                    this.levelByMethod = this.levelToZero;
                }
            }
        }
        // Invoked through `this`, so the stored method keeps its receiver.
        this.levelByMethod(lineItemsWithDigits);
    }

    /**
     * Level = index of the item's x position among the ascending distinct x
     * positions of the items passed to this call.
     */
    levelByXDiff(lineItemsWithDigits) {
        const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
        lineItemsWithDigits.forEach(item => {
            this.applyLevel(item, uniqueX.indexOf(item.x));
        });
    }

    /**
     * Level = index of the item's font in `uniqueFonts`.
     *
     * `uniqueFonts` is captured from the items of the first level() call. A
     * font that first shows up in a later call used to yield index -1, and
     * `' '.repeat(-3)` then threw a RangeError. Such a font is now appended,
     * giving it the next level, the same ordering calculateUniqueFonts
     * would have produced had it seen all items at once.
     */
    levelByFont(lineItemsWithDigits) {
        lineItemsWithDigits.forEach(item => {
            let level = this.uniqueFonts.indexOf(item.font);
            if (level < 0) {
                // push() returns the new length, so the new index is length - 1.
                level = this.uniqueFonts.push(item.font) - 1;
            }
            this.applyLevel(item, level);
        });
    }

    /** Fallback strategy: every item is a top-level (level 0) entry. */
    levelToZero(lineItemsWithDigits) {
        lineItemsWithDigits.forEach(item => {
            this.applyLevel(item, 0);
        });
    }

    /**
     * Records the item as a Headline (with its text before indentation) and
     * rewrites the item's text as a markdown list entry at the given level.
     */
    applyLevel(item, level) {
        this.headlines.push(new Headline({
            level: level,
            text: item.text
        }));
        item.text = ' '.repeat(level * 3) + '- ' + item.text;
    }

    /** Returns the distinct `x` values of the items, sorted ascending. */
    calculateUniqueX(lineItemsWithDigits) {
        var uniqueX = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
            if (uniquesArray.indexOf(lineItem.x) < 0) uniquesArray.push(lineItem.x);
            return uniquesArray;
        }, []);

        uniqueX.sort((a, b) => {
            return a - b
        });

        return uniqueX;
    }

    /** Returns the distinct `font` values of the items in order of first appearance. */
    calculateUniqueFonts(lineItemsWithDigits) {
        var uniqueFont = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
            if (uniquesArray.indexOf(lineItem.font) < 0) uniquesArray.push(lineItem.font);
            return uniquesArray;
        }, []);

        return uniqueFont;
    }

}
|
||||
|
||||
/**
 * Plain value holder for one leveled headline entry: its nesting level and
 * its text.
 */
class Headline {

    /**
     * @param options object providing `level` and `text`; only those two
     *        properties are copied onto the instance
     */
    constructor(options) {
        const { level, text } = options;
        this.level = level;
        this.text = text;
    }

}
|
||||
|
Loading…
Reference in New Issue
Block a user