[WIP] improve TOC parsing

This commit is contained in:
Johannes Zillmann 2017-03-07 08:47:50 +01:00
parent 1fcd08f6d5
commit c9352d8396
2 changed files with 156 additions and 12 deletions

View File

@ -3,26 +3,58 @@ import TextItemCombiner from './TextItemCombiner.jsx';
import TextItem from './TextItem.jsx';
// Labels used as the `type` of a block. blockToText() compares `block.type`
// against them to decide how a block is rendered.

// Headline types, one per level (see headlineByLevel()).
export const HEADLINE1 = "Headline 1";
export const HEADLINE2 = "Headline 2";
export const HEADLINE3 = "Headline 3";
export const HEADLINE4 = "Headline 4";
export const HEADLINE5 = "Headline 5";
export const HEADLINE6 = "Headline 6";

// Non-headline block types.
export const PARAGRAPH = "Paragraph";
export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC";
export const FOOTNOTE_BLOCK = "Footnotes";
export function headlineByLevel(level) {
if (level == 1) {
return HEADLINE1;
} else if (level == 2) {
return HEADLINE2;
} else if (level == 3) {
return HEADLINE3;
} else if (level == 4) {
return HEADLINE4;
} else if (level == 5) {
return HEADLINE5;
} else if (level == 6) {
return HEADLINE6;
}
throw "Unsupported headline level: " + level;
}
export function blockToText(block: PdfBlock) {
switch (block.type) {
case CODE_BLOCK:
return '```\n' + concatTextItems(block.textItems) + '```'
case TOC_BLOCK:
//TODO 2nd level
//TODO real links
var text = '';
//TODO real links
//TODO de-duplicate with DetectLists ?
block.textItems.forEach(item => {
text += '- ' + item.text + '\n';
text += item.text + '\n';
});
return text;
case HEADLINE1:
return '#' + concatTextItems(block.textItems);
return '# ' + concatTextItems(block.textItems);
case HEADLINE2:
return '## ' + concatTextItems(block.textItems);
case HEADLINE3:
return '### ' + concatTextItems(block.textItems);
case HEADLINE4:
return '#### ' + concatTextItems(block.textItems);
case HEADLINE5:
return '##### ' + concatTextItems(block.textItems);
case HEADLINE6:
return '###### ' + concatTextItems(block.textItems);
default:
var textItems = block.textItems;
if (!block.type) {

View File

@ -16,21 +16,24 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var foundTocPages = 0;
const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.content.length);
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
var lastLevel = 0;
const itemLeveler = new ItemLeveler();
parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
var linesCount = 0;
var linesWithDigitsCount = 0;
var lineItemsWithDigits = [];
const unknownBlocks = new Set();
var headlineBlock;
page.blocks.forEach(block => {
var blockHasLinesWithDigits = false;
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
var lastLineTextWithoutNumber;
itemsGroupedByY.forEach(lineItem => {
linesCount++
var lineText = lineItem.text.replace(/\./g, '').trim();
@ -41,26 +44,37 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
}
lineText = lineText.trim();
if (endsWithDigit) {
if (lastLineTextWithoutNumber) { // 2-line item ?
lineText = lastLineTextWithoutNumber + ' ' + lineText;
lastLineTextWithoutNumber = null;
}
linesWithDigitsCount++;
blockHasLinesWithDigits = true;
lineItemsWithDigits.push(new TextItem({
...lineItem,
text: lineText
}));
} else {
lastLineTextWithoutNumber = lineText;
}
});
if (!headlineBlock && !blockHasLinesWithDigits) {
headlineBlock = block;
if (!blockHasLinesWithDigits) {
if (!headlineBlock) {
headlineBlock = block;
} else {
unknownBlocks.add(block);
}
}
});
if (linesWithDigitsCount * 100 / linesCount > 75) {
foundTocPages++;
tocPages.push(page.index + 1);
const newBlocks = [];
page.blocks.forEach((block) => {
block.annotation = REMOVED_ANNOTATION;
if (!unknownBlocks.has(block)) {
block.annotation = REMOVED_ANNOTATION;
}
newBlocks.push(block);
if (block === headlineBlock) {
newBlocks.push(new PdfBlock({
textItems: textCombiner.combine(block.textItems).textItems,
@ -69,6 +83,8 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
}));
}
});
// lastLevel = processLevels(lineItemsWithDigits, lastLevel);
itemLeveler.level(lineItemsWithDigits);
newBlocks.push(new PdfBlock({
textItems: lineItemsWithDigits,
type: TOC_BLOCK,
@ -80,8 +96,104 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
return new ParseResult({
...parseResult,
messages: ['Detected ' + foundTocPages + ' table of content pages']
globals: {
...parseResult.globals,
tocPages: tocPages
},
messages: ['Detected ' + tocPages.length + ' table of content pages']
});
}
}
class ItemLeveler {
constructor() {
this.levelByMethod = null;
this.uniqueFonts = [];
this.headlines = [];
}
level(lineItemsWithDigits) {
if (!this.levelByMethod) {
const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
if (uniqueX.length > 1) {
this.levelByMethod = this.levelByXDiff;
} else {
const uniqueFonts = this.calculateUniqueFonts(lineItemsWithDigits);
if (uniqueFonts.length > 1) {
this.uniqueFonts = uniqueFonts;
this.levelByMethod = this.levelByFont;
} else {
this.levelByMethod = this.levelToZero;
}
}
}
this.levelByMethod(lineItemsWithDigits);
}
levelByXDiff(lineItemsWithDigits) {
const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
lineItemsWithDigits.forEach(item => {
const level = uniqueX.indexOf(item.x);
this.headlines.push(new Headline({
level: level,
text: item.text
}));
item.text = ' '.repeat(level * 3) + '- ' + item.text;
});
}
levelByFont(lineItemsWithDigits) {
lineItemsWithDigits.forEach(item => {
const level = this.uniqueFonts.indexOf(item.font);
this.headlines.push(new Headline({
level: level,
text: item.text
}));
item.text = ' '.repeat(level * 3) + '- ' + item.text;
});
}
levelToZero(lineItemsWithDigits) {
lineItemsWithDigits.forEach(item => {
const level = 0;
this.headlines.push(new Headline({
level: level,
text: item.text
}));
item.text = ' '.repeat(level * 3) + '- ' + item.text;
});
}
calculateUniqueX(lineItemsWithDigits) {
var uniqueX = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
if (uniquesArray.indexOf(lineItem.x) < 0) uniquesArray.push(lineItem.x);
return uniquesArray;
}, []);
uniqueX.sort((a, b) => {
return a - b
});
return uniqueX;
}
calculateUniqueFonts(lineItemsWithDigits) {
var uniqueFont = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
if (uniquesArray.indexOf(lineItem.font) < 0) uniquesArray.push(lineItem.font);
return uniquesArray;
}, []);
return uniqueFont;
}
}
class Headline {
constructor(options) {
this.level = options.level;
this.text = options.text;
}
}