[WIP] detect TOC on text items, not on blocks

This commit is contained in:
Johannes Zillmann 2017-03-10 09:52:29 +01:00
parent e2481bdd2a
commit bd4c207ae3
3 changed files with 79 additions and 98 deletions

View File

@ -49,6 +49,9 @@ export default class TextItemTable extends React.Component {
<div style={ { textAlign: 'center' } }> <div style={ { textAlign: 'center' } }>
{ textItem.annotation ? textItem.annotation.category : '' } { textItem.annotation ? textItem.annotation.category : '' }
</div> </div>
<div style={ { textAlign: 'center', color: 'brown' } }>
{ textItem.type ? textItem.type : '' }
</div>
<div style={ { textAlign: 'center', color: 'orange' } }> <div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' } { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }

View File

@ -1,13 +1,13 @@
import { Enum } from 'enumify'; import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx'; import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import CompactLines from './transformations/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import CompactLines from './transformations/CompactLines.jsx'; import DetectTOC from './transformations/DetectTOC.jsx'
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx' import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx' import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectLists from './transformations/DetectLists.jsx' import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectHeadlines from './transformations/DetectHeadlines.jsx' import DetectHeadlines from './transformations/DetectHeadlines.jsx'
@ -34,10 +34,10 @@ export default class AppState {
new CompactLines(), new CompactLines(),
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new VerticalToHorizontal(), new VerticalToHorizontal(),
new DetectTOC(),
new DetectPdfBlocks(), new DetectPdfBlocks(),
new DetectFootnotes(), new DetectFootnotes(),
new DetectTOC(),
new DetectLists(), new DetectLists(),
new DetectCodeBlocks(), new DetectCodeBlocks(),
new DetectHeadlines(), new DetectHeadlines(),

View File

@ -1,106 +1,97 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx'; import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx'; import TextItem from '../TextItem.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import HeadlineFinder from '../HeadlineFinder.jsx'; import HeadlineFinder from '../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx'; import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
import { isDigit } from '../../functions.jsx' import { isDigit } from '../../functions.jsx'
//Detect table of contents pages //Detect table of contents pages
export default class DetectTOC extends ToTextItemBlockTransformation { export default class DetectTOC extends ToTextItemTransformation {
constructor() { constructor() {
super("Detect TOC"); super("Detect TOC");
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
const tocPages = []; const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length); const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
const linkLeveler = new LinkLeveler(); const linkLeveler = new LinkLeveler();
var tocLinks = []; var tocLinks = [];
var lastTocPage; var lastTocPage;
var headlineItem;
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => { parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
var linesCount = 0; const lineItemsWithDigits = [];
var linesWithDigitsCount = 0; const unknownLines = new Set();
var lineItemsWithDigits = [];
const unknownBlocks = new Set();
var headlineBlock;
const pageTocLinks = []; const pageTocLinks = [];
page.items.forEach(block => { var lastLineTextWithoutNumber;
var blockHasLinesWithDigits = false; var lastLine;
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems; page.items.forEach(line => {
var lastLineTextWithoutNumber; var lineText = line.text.replace(/\./g, '').trim();
itemsGroupedByY.forEach(lineItem => { var endsWithDigit = false;
linesCount++ var digits = [];
var lineText = lineItem.text.replace(/\./g, '').trim(); while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
var endsWithDigit = false; digits.unshift(lineText.charAt(lineText.length - 1));
var digits = []; lineText = lineText.substring(0, lineText.length - 1);
while (isDigit(lineText.charCodeAt(lineText.length - 1))) { endsWithDigit = true;
digits.unshift(lineText.charAt(lineText.length - 1)); }
lineText = lineText.substring(0, lineText.length - 1); lineText = lineText.trim();
endsWithDigit = true; if (endsWithDigit) {
endsWithDigit = true;
if (lastLineTextWithoutNumber) { // 2-line item ?
lineText = lastLineTextWithoutNumber + ' ' + lineText;
lastLineTextWithoutNumber = null;
} }
lineText = lineText.trim(); pageTocLinks.push(new TocLink({
if (endsWithDigit) { pageNumber: parseInt(digits.join('')),
if (lastLineTextWithoutNumber) { // 2-line item ? textItem: new TextItem({
lineText = lastLineTextWithoutNumber + ' ' + lineText; ...line,
lastLineTextWithoutNumber = null;
}
linesWithDigitsCount++;
blockHasLinesWithDigits = true;
pageTocLinks.push(new TocLink({
pageNumber: parseInt(digits.join('')),
textItem: new TextItem({
...lineItem,
text: lineText
})
}));
lineItemsWithDigits.push(new TextItem({
...lineItem,
text: lineText text: lineText
})); })
}));
lineItemsWithDigits.push(new TextItem({
...line,
text: lineText
}));
lastLineTextWithoutNumber = null;
} else {
if (!headlineItem) {
headlineItem = line;
} else { } else {
if (lastLineTextWithoutNumber) {
unknownLines.add(lastLine);
}
lastLineTextWithoutNumber = lineText; lastLineTextWithoutNumber = lineText;
} lastLine = line;
});
if (!blockHasLinesWithDigits) {
if (!headlineBlock) {
headlineBlock = block;
} else {
unknownBlocks.add(block);
} }
} }
}); });
// page has been processed // page has been processed
if (linesWithDigitsCount * 100 / linesCount > 75) { if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
tocPages.push(page.index + 1); tocPages.push(page.index + 1);
lastTocPage = page; lastTocPage = page;
linkLeveler.levelPageItems(pageTocLinks); linkLeveler.levelPageItems(pageTocLinks);
tocLinks = tocLinks.concat(pageTocLinks); tocLinks = tocLinks.concat(pageTocLinks);
const newBlocks = []; const newBlocks = [];
page.items.forEach((block) => { page.items.forEach((line) => {
if (!unknownBlocks.has(block)) { if (!unknownLines.has(line)) {
block.annotation = REMOVED_ANNOTATION; line.annotation = REMOVED_ANNOTATION;
} }
newBlocks.push(block); newBlocks.push(line);
if (block === headlineBlock) { if (line === headlineItem) {
newBlocks.push(new TextItemBlock({ newBlocks.push(new TextItem({
textItems: textCombiner.combine(block.textItems).textItems, ...line,
type: HEADLINE2, type: HEADLINE2,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION
})); }));
} }
}); });
page.items = newBlocks; page.items = newBlocks;
} else {
headlineItem = null;
} }
}); });
@ -112,11 +103,11 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
var linkedPage = parseResult.pages[tocLink.pageNumber - 1]; var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
var foundHeadline = false; var foundHeadline = false;
if (linkedPage) { if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner); foundHeadline = findHeadline(linkedPage, tocLink);
if (!foundHeadline) { // pages are off by 1 ? if (!foundHeadline) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber]; linkedPage = parseResult.pages[tocLink.pageNumber];
if (linkedPage) { if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner); foundHeadline = findHeadline(linkedPage, tocLink);
} }
} }
} else { } else {
@ -126,14 +117,13 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
notFoundHeadlines.push(tocLink); notFoundHeadlines.push(tocLink);
} }
}); });
lastTocPage.items.push(new TextItemBlock({ tocLinks.forEach(tocLink => {
textItems: tocLinks.map(tocLink => { lastTocPage.items.push(new TextItem({
tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text; text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
return tocLink.textItem type: TOC_BLOCK,
}), annotation: ADDED_ANNOTATION
type: TOC_BLOCK, }));
annotation: ADDED_ANNOTATION });
}));
} }
const messages = []; const messages = [];
@ -157,37 +147,25 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
} }
function findHeadline(page, tocLink, textCombiner) { function findHeadline(page, tocLink) {
const headline = tocLink.textItem.text; const headline = tocLink.textItem.text;
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: headline headline: headline
}); });
var blockIndex = 0; var lineIndex = 0;
var lastBlock; for ( var line of page.items ) {
for ( var block of page.items ) { const headlineItems = headlineFinder.consume(line);
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems; if (headlineItems) {
for ( var item of itemsGroupedByY ) { headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
const headlineItems = headlineFinder.consume(item); page.items.splice(lineIndex + 1, 0, new TextItem({
if (headlineItems) { ...headlineItems[0],
const usedItems = headlineFinder.stackedTextItems; text: headline,
block.annotation = REMOVED_ANNOTATION; type: headlineByLevel(tocLink.level + 2),
if (usedItems.length > itemsGroupedByY.length) { annotation: ADDED_ANNOTATION
// 2 line headline }));
lastBlock.annotation = REMOVED_ANNOTATION; return true;
}
page.items.splice(blockIndex + 1, 0, new TextItemBlock({
textItems: [new TextItem({
...usedItems[0],
text: headline
})],
type: headlineByLevel(tocLink.level + 2),
annotation: ADDED_ANNOTATION
}));
return true;
}
} }
blockIndex++; lineIndex++;
lastBlock = block;
} }
return false; return false;
} }