Files
pdf-to-markdown/src/javascript/models/transformations/textitem/DetectTOC.jsx

249 lines
8.5 KiB
JavaScript

import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import HeadlineFinder from '../../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx';
import { isDigit } from '../../../functions.jsx'
//Detect table of contents pages
export default class DetectTOC extends ToTextItemTransformation {
constructor() {
super("Detect TOC");
}
transform(parseResult:ParseResult) {
const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
const linkLeveler = new LinkLeveler();
var tocLinks = [];
var lastTocPage;
var headlineItem;
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
const lineItemsWithDigits = [];
const unknownLines = new Set();
const pageTocLinks = [];
var lastLineTextWithoutNumber;
var lastLine;
page.items.forEach(line => {
var lineText = line.text.replace(/\./g, '').trim();
var endsWithDigit = false;
var digits = [];
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
digits.unshift(lineText.charAt(lineText.length - 1));
lineText = lineText.substring(0, lineText.length - 1);
endsWithDigit = true;
}
lineText = lineText.trim();
if (endsWithDigit) {
endsWithDigit = true;
if (lastLineTextWithoutNumber) { // 2-line item ?
lineText = lastLineTextWithoutNumber + ' ' + lineText;
lastLineTextWithoutNumber = null;
}
pageTocLinks.push(new TocLink({
pageNumber: parseInt(digits.join('')),
textItem: new TextItem({
...line,
text: lineText
})
}));
lineItemsWithDigits.push(new TextItem({
...line,
text: lineText
}));
lastLineTextWithoutNumber = null;
} else {
if (!headlineItem) {
headlineItem = line;
} else {
if (lastLineTextWithoutNumber) {
unknownLines.add(lastLine);
}
lastLineTextWithoutNumber = lineText;
lastLine = line;
}
}
});
// page has been processed
if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
tocPages.push(page.index + 1);
lastTocPage = page;
linkLeveler.levelPageItems(pageTocLinks);
tocLinks = tocLinks.concat(pageTocLinks);
const newBlocks = [];
page.items.forEach((line) => {
if (!unknownLines.has(line)) {
line.annotation = REMOVED_ANNOTATION;
}
newBlocks.push(line);
if (line === headlineItem) {
newBlocks.push(new TextItem({
...line,
type: ElementType.H2,
annotation: ADDED_ANNOTATION
}));
}
});
page.items = newBlocks;
} else {
headlineItem = null;
}
});
//all pages have been processed
var foundHeadlines = tocLinks.length;
const notFoundHeadlines = [];
if (tocPages.length > 0) {
tocLinks.forEach(tocLink => {
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
var foundHeadline = false;
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink);
if (!foundHeadline) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber];
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink);
}
}
} else {
//TODO sometimes pages are off. We could try the page range from pre to next ...
}
if (!foundHeadline) {
notFoundHeadlines.push(tocLink);
}
});
tocLinks.forEach(tocLink => {
lastTocPage.items.push(new TextItem({
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
type: ElementType.TOC,
annotation: ADDED_ANNOTATION
}));
});
}
const messages = [];
messages.push('Detected ' + tocPages.length + ' table of content pages');
if (foundHeadlines > 0) {
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
}
if (notFoundHeadlines.length > 0) {
messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
}
return new ParseResult({
...parseResult,
globals: {
...parseResult.globals,
tocPages: tocPages
},
messages: messages
});
}
}
/**
 * Searches `page` for the headline text of `tocLink`.
 *
 * Each page item is fed to a HeadlineFinder in order. On the first match the
 * matched items are annotated as removed and one replacement item — a copy of
 * the first matched item carrying the TOC text, a headline type derived from
 * `tocLink.level + 2`, and the ADDED annotation — is inserted directly after
 * the item that completed the match.
 *
 * @param page page whose `items` array is scanned and, on a match, modified in place
 * @param tocLink TOC entry providing the headline text and level
 * @returns true when the headline was found on the page, false otherwise
 */
function findHeadline(page, tocLink) {
    const headline = tocLink.textItem.text;
    const finder = new HeadlineFinder({
        headline: headline
    });
    for (let position = 0; position < page.items.length; position++) {
        const matchedItems = finder.consume(page.items[position]);
        if (!matchedItems) {
            continue;
        }
        matchedItems.forEach(matched => {
            matched.annotation = REMOVED_ANNOTATION;
        });
        const replacement = new TextItem({
            ...matchedItems[0],
            text: headline,
            type: headlineByLevel(tocLink.level + 2),
            annotation: ADDED_ANNOTATION
        });
        page.items.splice(position + 1, 0, replacement);
        return true;
    }
    return false;
}
class LinkLeveler {
constructor() {
this.levelByMethod = null;
this.uniqueFonts = [];
}
levelPageItems(tocLinks:TocLink[]) {
if (!this.levelByMethod) {
const uniqueX = this.calculateUniqueX(tocLinks);
if (uniqueX.length > 1) {
this.levelByMethod = this.levelByXDiff;
} else {
const uniqueFonts = this.calculateUniqueFonts(tocLinks);
if (uniqueFonts.length > 1) {
this.uniqueFonts = uniqueFonts;
this.levelByMethod = this.levelByFont;
} else {
this.levelByMethod = this.levelToZero;
}
}
}
this.levelByMethod(tocLinks);
}
levelByXDiff(tocLinks) {
const uniqueX = this.calculateUniqueX(tocLinks);
tocLinks.forEach(link => {
link.level = uniqueX.indexOf(link.textItem.x);
});
}
levelByFont(tocLinks) {
tocLinks.forEach(link => {
link.level = this.uniqueFonts.indexOf(link.textItem.font);
});
}
levelToZero(tocLinks) {
tocLinks.forEach(link => {
link.level = 0;
});
}
calculateUniqueX(tocLinks) {
var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
return uniquesArray;
}, []);
uniqueX.sort((a, b) => {
return a - b
});
return uniqueX;
}
calculateUniqueFonts(tocLinks) {
var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
return uniquesArray;
}, []);
return uniqueFont;
}
}
class TocLink {
constructor(options) {
this.textItem = options.textItem;
this.pageNumber = options.pageNumber;
this.level = 0;
}
}