[WIP] improve TOC parsing

This commit is contained in:
Johannes Zillmann 2017-03-07 08:47:50 +01:00
parent 1fcd08f6d5
commit c9352d8396
2 changed files with 156 additions and 12 deletions

View File

@ -3,26 +3,58 @@ import TextItemCombiner from './TextItemCombiner.jsx';
import TextItem from './TextItem.jsx'; import TextItem from './TextItem.jsx';
export const HEADLINE1 = "Headline 1"; export const HEADLINE1 = "Headline 1";
export const HEADLINE2 = "Headline 2";
export const HEADLINE3 = "Headline 3";
export const HEADLINE4 = "Headline 4";
export const HEADLINE5 = "Headline 5";
export const HEADLINE6 = "Headline 6";
export const PARAGRAPH = "Paragraph"; export const PARAGRAPH = "Paragraph";
export const LIST_BLOCK = "List"; export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote"; export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC"; export const TOC_BLOCK = "TOC";
export const FOOTNOTE_BLOCK = "Footnotes" export const FOOTNOTE_BLOCK = "Footnotes"
/**
 * Maps a headline level to the matching headline block-type constant.
 *
 * @param level headline depth, 1 (top-most) through 6. The value is converted
 *              with Number(), so numeric strings such as "2" are accepted just
 *              as the previous loose (==) comparison accepted them.
 * @returns one of HEADLINE1 ... HEADLINE6
 * @throws {Error} when the level does not resolve to an integer from 1 to 6
 */
export function headlineByLevel(level) {
    // Index 0 holds level 1, hence the shift by one in the lookup below.
    const headlinesByDepth = [HEADLINE1, HEADLINE2, HEADLINE3, HEADLINE4, HEADLINE5, HEADLINE6];
    const headline = headlinesByDepth[Number(level) - 1];
    if (headline === undefined) {
        // Throw a real Error (not a bare string) so callers get a stack trace
        // and can rely on `instanceof Error` / `.message`.
        throw new Error("Unsupported headline level: " + level);
    }
    return headline;
}
export function blockToText(block: PdfBlock) { export function blockToText(block: PdfBlock) {
switch (block.type) { switch (block.type) {
case CODE_BLOCK: case CODE_BLOCK:
return '```\n' + concatTextItems(block.textItems) + '```' return '```\n' + concatTextItems(block.textItems) + '```'
case TOC_BLOCK: case TOC_BLOCK:
//TODO 2nd level
//TODO real links
var text = ''; var text = '';
//TODO real links
//TODO de-duplicate with DetectLists ?
block.textItems.forEach(item => { block.textItems.forEach(item => {
text += '- ' + item.text + '\n'; text += item.text + '\n';
}); });
return text; return text;
case HEADLINE1: case HEADLINE1:
return '#' + concatTextItems(block.textItems); return '# ' + concatTextItems(block.textItems);
case HEADLINE2:
return '## ' + concatTextItems(block.textItems);
case HEADLINE3:
return '### ' + concatTextItems(block.textItems);
case HEADLINE4:
return '#### ' + concatTextItems(block.textItems);
case HEADLINE5:
return '##### ' + concatTextItems(block.textItems);
case HEADLINE6:
return '###### ' + concatTextItems(block.textItems);
default: default:
var textItems = block.textItems; var textItems = block.textItems;
if (!block.type) { if (!block.type) {

View File

@ -16,21 +16,24 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals; const {mostUsedDistance} = parseResult.globals;
var foundTocPages = 0; const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.content.length); const maxPagesToEvaluate = Math.min(20, parseResult.content.length);
const textCombiner = new TextItemCombiner({ const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance mostUsedDistance: mostUsedDistance
}); });
var lastLevel = 0;
const itemLeveler = new ItemLeveler();
parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => { parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
var linesCount = 0; var linesCount = 0;
var linesWithDigitsCount = 0; var linesWithDigitsCount = 0;
var lineItemsWithDigits = []; var lineItemsWithDigits = [];
const unknownBlocks = new Set();
var headlineBlock; var headlineBlock;
page.blocks.forEach(block => { page.blocks.forEach(block => {
var blockHasLinesWithDigits = false; var blockHasLinesWithDigits = false;
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems; const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
var lastLineTextWithoutNumber;
itemsGroupedByY.forEach(lineItem => { itemsGroupedByY.forEach(lineItem => {
linesCount++ linesCount++
var lineText = lineItem.text.replace(/\./g, '').trim(); var lineText = lineItem.text.replace(/\./g, '').trim();
@ -41,26 +44,37 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
} }
lineText = lineText.trim(); lineText = lineText.trim();
if (endsWithDigit) { if (endsWithDigit) {
if (lastLineTextWithoutNumber) { // 2-line item ?
lineText = lastLineTextWithoutNumber + ' ' + lineText;
lastLineTextWithoutNumber = null;
}
linesWithDigitsCount++; linesWithDigitsCount++;
blockHasLinesWithDigits = true; blockHasLinesWithDigits = true;
lineItemsWithDigits.push(new TextItem({ lineItemsWithDigits.push(new TextItem({
...lineItem, ...lineItem,
text: lineText text: lineText
})); }));
} else {
lastLineTextWithoutNumber = lineText;
} }
}); });
if (!headlineBlock && !blockHasLinesWithDigits) { if (!blockHasLinesWithDigits) {
headlineBlock = block; if (!headlineBlock) {
headlineBlock = block;
} else {
unknownBlocks.add(block);
}
} }
}); });
if (linesWithDigitsCount * 100 / linesCount > 75) { if (linesWithDigitsCount * 100 / linesCount > 75) {
foundTocPages++; tocPages.push(page.index + 1);
const newBlocks = []; const newBlocks = [];
page.blocks.forEach((block) => { page.blocks.forEach((block) => {
block.annotation = REMOVED_ANNOTATION; if (!unknownBlocks.has(block)) {
block.annotation = REMOVED_ANNOTATION;
}
newBlocks.push(block); newBlocks.push(block);
if (block === headlineBlock) { if (block === headlineBlock) {
newBlocks.push(new PdfBlock({ newBlocks.push(new PdfBlock({
textItems: textCombiner.combine(block.textItems).textItems, textItems: textCombiner.combine(block.textItems).textItems,
@ -69,6 +83,8 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
})); }));
} }
}); });
// lastLevel = processLevels(lineItemsWithDigits, lastLevel);
itemLeveler.level(lineItemsWithDigits);
newBlocks.push(new PdfBlock({ newBlocks.push(new PdfBlock({
textItems: lineItemsWithDigits, textItems: lineItemsWithDigits,
type: TOC_BLOCK, type: TOC_BLOCK,
@ -80,8 +96,104 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
messages: ['Detected ' + foundTocPages + ' table of content pages'] globals: {
...parseResult.globals,
tocPages: tocPages
},
messages: ['Detected ' + tocPages.length + ' table of content pages']
}); });
} }
} }
/**
 * Assigns a nesting level to table-of-contents line items and rewrites each
 * item's text into an indented markdown list entry: three spaces of
 * indentation per level, followed by "- " and the original text.
 *
 * The leveling strategy is chosen once, from the first non-empty batch handed
 * to level(), and is reused for every later batch:
 *   1. more than one distinct x position   -> level by x position
 *   2. otherwise more than one distinct font -> level by font
 *   3. otherwise                             -> every item is level 0
 *
 * Every processed item is also recorded in `headlines` (level + original text).
 */
class ItemLeveler {

    constructor() {
        // Strategy method selected by the first non-empty level() call; null until then.
        this.levelByMethod = null;
        // Known fonts in order of first appearance; the array index is the level.
        this.uniqueFonts = [];
        // Headline entries collected across all calls, in processing order.
        this.headlines = [];
    }

    /**
     * Levels one batch of items in place, picking the strategy on first use.
     *
     * @param lineItemsWithDigits items exposing `x`, `font` and a mutable `text`
     */
    level(lineItemsWithDigits) {
        // An empty batch carries no layout information. Skipping it prevents it
        // from permanently locking the strategy to "everything is level 0".
        if (lineItemsWithDigits.length === 0) {
            return;
        }
        if (!this.levelByMethod) {
            if (this.calculateUniqueX(lineItemsWithDigits).length > 1) {
                this.levelByMethod = this.levelByXDiff;
            } else {
                const fonts = this.calculateUniqueFonts(lineItemsWithDigits);
                if (fonts.length > 1) {
                    this.uniqueFonts = fonts;
                    this.levelByMethod = this.levelByFont;
                } else {
                    this.levelByMethod = this.levelToZero;
                }
            }
        }
        this.levelByMethod(lineItemsWithDigits);
    }

    /**
     * Levels items by their x position: the left-most x is level 0, the next
     * distinct x is level 1, and so on. The distinct x values are recomputed on
     * every call, so levels are relative to the positions present in this batch.
     */
    levelByXDiff(lineItemsWithDigits) {
        const sortedX = this.calculateUniqueX(lineItemsWithDigits);
        lineItemsWithDigits.forEach(item => {
            this.applyLevel(item, sortedX.indexOf(item.x));
        });
    }

    /**
     * Levels items by font, using each font's position in `uniqueFonts`.
     * A font that was not registered yet is appended and therefore becomes the
     * next deeper level. Without that, indexOf() would yield -1 and the negative
     * indentation count would make String.prototype.repeat throw a RangeError.
     */
    levelByFont(lineItemsWithDigits) {
        lineItemsWithDigits.forEach(item => {
            let fontLevel = this.uniqueFonts.indexOf(item.font);
            if (fontLevel < 0) {
                // push() returns the new length, so the new font's index is length - 1.
                fontLevel = this.uniqueFonts.push(item.font) - 1;
            }
            this.applyLevel(item, fontLevel);
        });
    }

    /**
     * Fallback strategy: treats every item as a top-level (level 0) entry.
     */
    levelToZero(lineItemsWithDigits) {
        lineItemsWithDigits.forEach(item => {
            this.applyLevel(item, 0);
        });
    }

    /**
     * Returns the distinct `x` values of the items, sorted ascending.
     */
    calculateUniqueX(lineItemsWithDigits) {
        const distinctX = Array.from(new Set(lineItemsWithDigits.map(lineItem => lineItem.x)));
        distinctX.sort((a, b) => a - b);
        return distinctX;
    }

    /**
     * Returns the distinct `font` values of the items in order of first appearance.
     */
    calculateUniqueFonts(lineItemsWithDigits) {
        return Array.from(new Set(lineItemsWithDigits.map(lineItem => lineItem.font)));
    }

    /**
     * Records the item as a headline of the given level (with its text as it was
     * before prefixing) and then rewrites the item's text into list-entry form.
     */
    applyLevel(item, level) {
        this.headlines.push(new Headline({
            level: level,
            text: item.text
        }));
        item.text = ' '.repeat(level * 3) + '- ' + item.text;
    }

}
/**
 * Plain value holder for one detected headline entry.
 */
class Headline {

    /**
     * @param options object supplying `level` and `text`; both values are
     *                copied onto the new instance, any other keys are ignored
     */
    constructor(options) {
        const {level, text} = options;
        this.level = level;
        this.text = text;
    }

}