WIP initial TOC detection

This commit is contained in:
Johannes Zillmann 2017-02-19 10:20:14 +01:00
parent bed3fd357b
commit 2783d724e5
5 changed files with 129 additions and 27 deletions

View File

@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx' import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectFormats from './transformations/DetectFormats.jsx' import DetectFormats from './transformations/DetectFormats.jsx'
import CombineSameY from './transformations/CombineSameY.jsx'; import CombineSameY from './transformations/CombineSameY.jsx';
@ -29,6 +30,7 @@ export default class AppState {
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new VerticalToHorizontal(), new VerticalToHorizontal(),
new DetectPdfBlocks(), new DetectPdfBlocks(),
new DetectTOC(),
new DetectCodeBlocks(), new DetectCodeBlocks(),
// new DetectFormats(), // new DetectFormats(),
// new CombineSameY(), // new CombineSameY(),

View File

@ -1,17 +1,25 @@
import PdfBlock from './BlockPage.jsx'; import PdfBlock from './BlockPage.jsx';
export const CODE_BLOCK = "Code/Quote";
export const HEADLINE1 = "Headline 1"; export const HEADLINE1 = "Headline 1";
export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC";
export function blockToText(block: PdfBlock) { export function blockToText(block: PdfBlock) {
const text = concatTextItems(block);
switch (block.type) { switch (block.type) {
case CODE_BLOCK: case CODE_BLOCK:
return '```\n' + text + '```' return '```\n' + concatTextItems(block) + '```'
case HEADLINE1: case TOC_BLOCK:
return '#' + text; //TODO 2nd level
default: //TODO real links
var text = '';
block.textItems.forEach(item => {
text += '- ' + item.text + '\n';
});
return text; return text;
case HEADLINE1:
return '#' + concatTextItems(block);
default:
return concatTextItems(block);
} }
} }

View File

@ -6,26 +6,32 @@ export default class TextItemCombiner {
constructor(options) { constructor(options) {
this.transformEmphasis = options.transformEmphasis || true; this.transformEmphasis = options.transformEmphasis || true;
console.debug(this.transformEmphasis); this.maxYDerivation = options.transformEmphasis || 3;
} }
// returns a TextItem array new items // returns a TextItem array new items
combine(textItems: TextItem[]) { combine(textItems: TextItem[]) {
const resultItems = []; const resultItems = [];
const groupedItems = groupByFollowingY(textItems); const groupedItems = this.groupByFollowingY(textItems);
groupedItems.forEach(itemGroup => { groupedItems.forEach(itemGroup => {
if (itemGroup.length == 1) { if (itemGroup.length == 1) {
resultItems.push(itemGroup[0]); resultItems.push(itemGroup[0]);
} else { } else {
var text = ''; var text = '';
var maxHeight = 0;
var widthSum = 0;
itemGroup.forEach(item => { itemGroup.forEach(item => {
// item.annotation = REMOVED_ANNOTATION; // item.annotation = REMOVED_ANNOTATION;
// resultItems.push(item); // resultItems.push(item);
text += item.text; text += item.text;
widthSum += item.width;
}); });
//TODO set other elements //TODO set other elements
resultItems.push(new TextItem({ resultItems.push(new TextItem({
...itemGroup[0],
text: text, text: text,
height: maxHeight,
width: widthSum,
})); }));
} }
}); });
@ -36,21 +42,20 @@ export default class TextItemCombiner {
return resultItems; return resultItems;
} }
} groupByFollowingY(textItems) {
const yArrays = [];
function groupByFollowingY(textItems) { var itemsWithSameY = [];
const yArrays = []; var lastItem;
var itemsWithSameY = []; textItems.forEach(item => {
var lastItem; if (itemsWithSameY.length == 0 || Math.abs(lastItem.y - item.y) <= this.maxYDerivation) {
textItems.forEach(item => { itemsWithSameY.push(item);
if (itemsWithSameY.length == 0 || item.y == lastItem.y) { } else {
itemsWithSameY.push(item); yArrays.push(itemsWithSameY);
} else { itemsWithSameY = [item];
yArrays.push(itemsWithSameY); }
itemsWithSameY = [item]; lastItem = item;
} })
lastItem = item; yArrays.push(itemsWithSameY);
}) return yArrays;
yArrays.push(itemsWithSameY); }
return yArrays;
} }

View File

@ -20,8 +20,6 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
</div>; </div>;
} }
// TODO ==> combine quotes follow each other
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedHeight, mostUsedDistance} = parseResult.globals; const {mostUsedHeight, mostUsedDistance} = parseResult.globals;

View File

@ -0,0 +1,89 @@
import React from 'react';
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { TOC_BLOCK } from '../MarkdownElements.jsx';
import Annotation from '../Annotation.jsx';
import { groupByFollowingY } from '../TextItemCombiner.jsx';
import { isNumber, isDigit } from '../../functions.jsx'
//Detect table of contents pages
export default class DetectTOC extends ToPdfBlockViewTransformation {
constructor() {
super("Detect Table of Contents");
}
createSummaryView(parseResult:ParseResult) {
return <div>
Detected
{ ' ' + parseResult.summary.foundTocPages + ' ' } table of content pages.
</div>;
}
transform(parseResult:ParseResult) {
var foundTocPages = 0;
var x = Math.min(12, parseResult.content.length);
const textCombiner = new TextItemCombiner({});
parseResult.content.slice(0, x).forEach(page => {
var linesCount = 0;
var linesWithDigitsCount = 0;
var lineItemsWithDigits = [];
var headlineBlock;
page.blocks.forEach(block => {
var blockHasLinesWithDigits = false;
const itemsGroupedByY = textCombiner.combine(block.textItems);
itemsGroupedByY.forEach(lineItem => {
linesCount++
var lineText = lineItem.text.replace(/\./g, '').trim();
var endsWithDigit = false;
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
lineText = lineText.substring(0, lineText.length - 2);
endsWithDigit = true;
}
lineText = lineText.trim();
if (endsWithDigit) {
linesWithDigitsCount++;
blockHasLinesWithDigits = true;
lineItemsWithDigits.push(new TextItem({
...lineItem,
text: lineText
}));
}
});
if (!blockHasLinesWithDigits) {
if (!headlineBlock) {
headlineBlock = block;
}
}
});
if (linesWithDigitsCount * 100 / linesCount > 75) {
foundTocPages++;
page.blocks.forEach(block => {
if (block !== headlineBlock) {
block.annotation = REMOVED_ANNOTATION;
}
});
page.blocks.push(new PdfBlock({
textItems: lineItemsWithDigits,
type: TOC_BLOCK,
annotation: ADDED_ANNOTATION
}));
}
});
return new ParseResult({
...parseResult,
summary: {
foundTocPages: foundTocPages
}
});
}
}