[WIP] detect TOC on text items, not on blocks

This commit is contained in:
Johannes Zillmann 2017-03-10 09:52:29 +01:00
parent e2481bdd2a
commit bd4c207ae3
3 changed files with 79 additions and 98 deletions

View File

@ -49,6 +49,9 @@ export default class TextItemTable extends React.Component {
<div style={ { textAlign: 'center' } }>
{ textItem.annotation ? textItem.annotation.category : '' }
</div>
<div style={ { textAlign: 'center', color: 'brown' } }>
{ textItem.type ? textItem.type : '' }
</div>
<div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }

View File

@ -1,13 +1,13 @@
import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import CompactLines from './transformations/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import CompactLines from './transformations/CompactLines.jsx';
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
@ -34,10 +34,10 @@ export default class AppState {
new CompactLines(),
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
new DetectTOC(),
new DetectPdfBlocks(),
new DetectFootnotes(),
new DetectTOC(),
new DetectLists(),
new DetectCodeBlocks(),
new DetectHeadlines(),

View File

@ -1,45 +1,34 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import HeadlineFinder from '../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
import { isDigit } from '../../functions.jsx'
//Detect table of contents pages
export default class DetectTOC extends ToTextItemBlockTransformation {
export default class DetectTOC extends ToTextItemTransformation {
constructor() {
super("Detect TOC");
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
const linkLeveler = new LinkLeveler();
var tocLinks = [];
var lastTocPage;
var headlineItem;
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
var linesCount = 0;
var linesWithDigitsCount = 0;
var lineItemsWithDigits = [];
const unknownBlocks = new Set();
var headlineBlock;
const lineItemsWithDigits = [];
const unknownLines = new Set();
const pageTocLinks = [];
page.items.forEach(block => {
var blockHasLinesWithDigits = false;
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
var lastLineTextWithoutNumber;
itemsGroupedByY.forEach(lineItem => {
linesCount++
var lineText = lineItem.text.replace(/\./g, '').trim();
var lastLine;
page.items.forEach(line => {
var lineText = line.text.replace(/\./g, '').trim();
var endsWithDigit = false;
var digits = [];
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
@ -49,58 +38,60 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
}
lineText = lineText.trim();
if (endsWithDigit) {
endsWithDigit = true;
if (lastLineTextWithoutNumber) { // 2-line item ?
lineText = lastLineTextWithoutNumber + ' ' + lineText;
lastLineTextWithoutNumber = null;
}
linesWithDigitsCount++;
blockHasLinesWithDigits = true;
pageTocLinks.push(new TocLink({
pageNumber: parseInt(digits.join('')),
textItem: new TextItem({
...lineItem,
...line,
text: lineText
})
}));
lineItemsWithDigits.push(new TextItem({
...lineItem,
...line,
text: lineText
}));
lastLineTextWithoutNumber = null;
} else {
lastLineTextWithoutNumber = lineText;
if (!headlineItem) {
headlineItem = line;
} else {
if (lastLineTextWithoutNumber) {
unknownLines.add(lastLine);
}
});
if (!blockHasLinesWithDigits) {
if (!headlineBlock) {
headlineBlock = block;
} else {
unknownBlocks.add(block);
lastLineTextWithoutNumber = lineText;
lastLine = line;
}
}
});
// page has been processed
if (linesWithDigitsCount * 100 / linesCount > 75) {
if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
tocPages.push(page.index + 1);
lastTocPage = page;
linkLeveler.levelPageItems(pageTocLinks);
tocLinks = tocLinks.concat(pageTocLinks);
const newBlocks = [];
page.items.forEach((block) => {
if (!unknownBlocks.has(block)) {
block.annotation = REMOVED_ANNOTATION;
page.items.forEach((line) => {
if (!unknownLines.has(line)) {
line.annotation = REMOVED_ANNOTATION;
}
newBlocks.push(block);
if (block === headlineBlock) {
newBlocks.push(new TextItemBlock({
textItems: textCombiner.combine(block.textItems).textItems,
newBlocks.push(line);
if (line === headlineItem) {
newBlocks.push(new TextItem({
...line,
type: HEADLINE2,
annotation: ADDED_ANNOTATION
}));
}
});
page.items = newBlocks;
} else {
headlineItem = null;
}
});
@ -112,11 +103,11 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
var foundHeadline = false;
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
foundHeadline = findHeadline(linkedPage, tocLink);
if (!foundHeadline) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber];
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
foundHeadline = findHeadline(linkedPage, tocLink);
}
}
} else {
@ -126,14 +117,13 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
notFoundHeadlines.push(tocLink);
}
});
lastTocPage.items.push(new TextItemBlock({
textItems: tocLinks.map(tocLink => {
tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
return tocLink.textItem
}),
tocLinks.forEach(tocLink => {
lastTocPage.items.push(new TextItem({
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
type: TOC_BLOCK,
annotation: ADDED_ANNOTATION
}));
});
}
const messages = [];
@ -157,37 +147,25 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
}
// NOTE(review): this span comes from a rendered diff hunk whose +/- markers were lost.
// Two versions of findHeadline are interleaved below: one taking (page, tocLink, textCombiner)
// that walks blocks, and one taking (page, tocLink) that walks page.items directly.
// Presumably the three-argument, block-based lines are the removed ones — confirm against
// the repository. Read this as a before/after pair, not as a single compilable function.
//
// Purpose (both variants): look on `page` for the headline text of a TOC link.
// Items are fed one at a time to a HeadlineFinder built from tocLink.textItem.text.
// On the first truthy result the matched source is annotated REMOVED_ANNOTATION, a new
// entry carrying the headline text, type headlineByLevel(tocLink.level + 2) and
// ADDED_ANNOTATION is spliced into page.items right after the current position, and
// true is returned. If the loop finishes without a match, false is returned.
function findHeadline(page, tocLink, textCombiner) {
function findHeadline(page, tocLink) {
const headline = tocLink.textItem.text;
const headlineFinder = new HeadlineFinder({
headline: headline
});
// Block-based variant: iterate blocks and regroup each block's text items with textCombiner
// before feeding them to the finder. blockIndex/lastBlock track the position for the
// splice and the previous block for the two-line case below.
var blockIndex = 0;
var lastBlock;
for ( var block of page.items ) {
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
for ( var item of itemsGroupedByY ) {
const headlineItems = headlineFinder.consume(item);
// Text-item variant: page.items entries are fed to the finder directly;
// lineIndex is the index of the entry currently being consumed.
var lineIndex = 0;
for ( var line of page.items ) {
const headlineItems = headlineFinder.consume(line);
// consume() yields a truthy value on a match (presumably once the consumed items
// spell out the whole headline — HeadlineFinder is defined elsewhere; confirm).
if (headlineItems) {
// Block-based variant: mark the current block as removed; if the finder stacked more
// items than this block contains, the previous block is marked removed as well.
const usedItems = headlineFinder.stackedTextItems;
block.annotation = REMOVED_ANNOTATION;
if (usedItems.length > itemsGroupedByY.length) {
// 2 line headline
lastBlock.annotation = REMOVED_ANNOTATION;
}
// Block-based variant: insert a replacement block directly after the matched block.
page.items.splice(blockIndex + 1, 0, new TextItemBlock({
textItems: [new TextItem({
...usedItems[0],
text: headline
})],
// Text-item variant: mark every item returned by consume() as removed, then insert one
// replacement item after the current line, copying the first matched item's fields
// and overriding text, type and annotation.
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
page.items.splice(lineIndex + 1, 0, new TextItem({
...headlineItems[0],
text: headline,
type: headlineByLevel(tocLink.level + 2),
annotation: ADDED_ANNOTATION
}));
// Returning immediately means the loop never continues over the array that was just spliced.
return true;
}
}
blockIndex++;
lastBlock = block;
lineIndex++;
}
return false;
}