mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-07-16 05:55:14 +02:00
249 lines
8.5 KiB
JavaScript
249 lines
8.5 KiB
JavaScript
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
|
import ParseResult from '../../ParseResult.jsx';
|
|
import TextItem from '../../TextItem.jsx';
|
|
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
|
import ElementType from '../../ElementType.jsx';
|
|
import { headlineByLevel } from '../../ElementType.jsx';
|
|
import { isDigit } from '../../../functions.jsx'
|
|
|
|
//Detect table of contents pages
|
|
export default class DetectTOC extends ToTextItemTransformation {
|
|
|
|
constructor() {
|
|
super("Detect TOC");
|
|
}
|
|
|
|
transform(parseResult:ParseResult) {
|
|
const tocPages = [];
|
|
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
|
|
|
const linkLeveler = new LinkLeveler();
|
|
var tocLinks = [];
|
|
var lastTocPage;
|
|
var headlineItem;
|
|
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
|
const lineItemsWithDigits = [];
|
|
const unknownLines = new Set();
|
|
const pageTocLinks = [];
|
|
var lastLineTextWithoutNumber;
|
|
var lastLine;
|
|
page.items.forEach(line => {
|
|
var lineText = line.text.replace(/\./g, '').trim();
|
|
var endsWithDigit = false;
|
|
var digits = [];
|
|
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
|
|
digits.unshift(lineText.charAt(lineText.length - 1));
|
|
lineText = lineText.substring(0, lineText.length - 1);
|
|
endsWithDigit = true;
|
|
}
|
|
lineText = lineText.trim();
|
|
if (endsWithDigit) {
|
|
endsWithDigit = true;
|
|
if (lastLineTextWithoutNumber) { // 2-line item ?
|
|
lineText = lastLineTextWithoutNumber + ' ' + lineText;
|
|
lastLineTextWithoutNumber = null;
|
|
}
|
|
pageTocLinks.push(new TocLink({
|
|
pageNumber: parseInt(digits.join('')),
|
|
textItem: new TextItem({
|
|
...line,
|
|
text: lineText
|
|
})
|
|
}));
|
|
lineItemsWithDigits.push(new TextItem({
|
|
...line,
|
|
text: lineText
|
|
}));
|
|
lastLineTextWithoutNumber = null;
|
|
} else {
|
|
if (!headlineItem) {
|
|
headlineItem = line;
|
|
} else {
|
|
if (lastLineTextWithoutNumber) {
|
|
unknownLines.add(lastLine);
|
|
}
|
|
lastLineTextWithoutNumber = lineText;
|
|
lastLine = line;
|
|
}
|
|
}
|
|
});
|
|
|
|
// page has been processed
|
|
if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
|
|
tocPages.push(page.index + 1);
|
|
lastTocPage = page;
|
|
linkLeveler.levelPageItems(pageTocLinks);
|
|
tocLinks = tocLinks.concat(pageTocLinks);
|
|
|
|
const newBlocks = [];
|
|
page.items.forEach((line) => {
|
|
if (!unknownLines.has(line)) {
|
|
line.annotation = REMOVED_ANNOTATION;
|
|
}
|
|
newBlocks.push(line);
|
|
if (line === headlineItem) {
|
|
newBlocks.push(new TextItem({
|
|
...line,
|
|
type: ElementType.H2,
|
|
annotation: ADDED_ANNOTATION
|
|
}));
|
|
}
|
|
});
|
|
page.items = newBlocks;
|
|
} else {
|
|
headlineItem = null;
|
|
}
|
|
});
|
|
|
|
//all pages have been processed
|
|
var foundHeadlines = tocLinks.length;
|
|
const notFoundHeadlines = [];
|
|
if (tocPages.length > 0) {
|
|
tocLinks.forEach(tocLink => {
|
|
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
|
var foundHeadline = false;
|
|
if (linkedPage) {
|
|
foundHeadline = findHeadline(linkedPage, tocLink);
|
|
if (!foundHeadline) { // pages are off by 1 ?
|
|
linkedPage = parseResult.pages[tocLink.pageNumber];
|
|
if (linkedPage) {
|
|
foundHeadline = findHeadline(linkedPage, tocLink);
|
|
}
|
|
}
|
|
} else {
|
|
//TODO sometimes pages are off. We could try the page range from pre to next ...
|
|
}
|
|
if (!foundHeadline) {
|
|
notFoundHeadlines.push(tocLink);
|
|
}
|
|
});
|
|
tocLinks.forEach(tocLink => {
|
|
lastTocPage.items.push(new TextItem({
|
|
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
|
type: ElementType.TOC,
|
|
annotation: ADDED_ANNOTATION
|
|
}));
|
|
});
|
|
}
|
|
|
|
const messages = [];
|
|
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
|
if (foundHeadlines > 0) {
|
|
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
|
|
}
|
|
if (notFoundHeadlines.length > 0) {
|
|
messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
|
|
}
|
|
return new ParseResult({
|
|
...parseResult,
|
|
globals: {
|
|
...parseResult.globals,
|
|
tocPages: tocPages
|
|
|
|
},
|
|
messages: messages
|
|
});
|
|
}
|
|
|
|
}
|
|
|
|
/**
 * Looks for the headline of a TOC link among the items of a page.
 *
 * The page items are fed one by one into a HeadlineFinder created for the
 * link text. As soon as the finder returns matched items, those items are
 * annotated as removed and a new headline item (copied from the first matched
 * item, carrying the link text and a headline type derived from the link
 * level) is inserted right after the item that completed the match.
 *
 * @param page page whose `items` array is searched and, on a match, modified
 * @param tocLink TOC link providing `textItem.text` and `level`
 * @returns true if the headline was found on the page, false otherwise
 */
function findHeadline(page, tocLink) {
    const headline = tocLink.textItem.text;
    const finder = new HeadlineFinder({
        headline
    });
    const items = page.items;

    for (let index = 0; index < items.length; index++) {
        const matchedItems = finder.consume(items[index]);
        if (!matchedItems) {
            continue;
        }

        matchedItems.forEach(item => {
            item.annotation = REMOVED_ANNOTATION;
        });
        const headlineItem = new TextItem({
            ...matchedItems[0],
            text: headline,
            type: headlineByLevel(tocLink.level + 2),
            annotation: ADDED_ANNOTATION
        });
        items.splice(index + 1, 0, headlineItem);
        return true;
    }

    return false;
}
|
|
|
|
|
|
class LinkLeveler {
|
|
constructor() {
|
|
this.levelByMethod = null;
|
|
this.uniqueFonts = [];
|
|
}
|
|
|
|
levelPageItems(tocLinks:TocLink[]) {
|
|
if (!this.levelByMethod) {
|
|
const uniqueX = this.calculateUniqueX(tocLinks);
|
|
if (uniqueX.length > 1) {
|
|
this.levelByMethod = this.levelByXDiff;
|
|
} else {
|
|
const uniqueFonts = this.calculateUniqueFonts(tocLinks);
|
|
if (uniqueFonts.length > 1) {
|
|
this.uniqueFonts = uniqueFonts;
|
|
this.levelByMethod = this.levelByFont;
|
|
} else {
|
|
this.levelByMethod = this.levelToZero;
|
|
}
|
|
}
|
|
}
|
|
this.levelByMethod(tocLinks);
|
|
}
|
|
|
|
levelByXDiff(tocLinks) {
|
|
const uniqueX = this.calculateUniqueX(tocLinks);
|
|
tocLinks.forEach(link => {
|
|
link.level = uniqueX.indexOf(link.textItem.x);
|
|
});
|
|
}
|
|
|
|
levelByFont(tocLinks) {
|
|
tocLinks.forEach(link => {
|
|
link.level = this.uniqueFonts.indexOf(link.textItem.font);
|
|
});
|
|
}
|
|
|
|
levelToZero(tocLinks) {
|
|
tocLinks.forEach(link => {
|
|
link.level = 0;
|
|
});
|
|
}
|
|
|
|
calculateUniqueX(tocLinks) {
|
|
var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
|
|
if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
|
|
return uniquesArray;
|
|
}, []);
|
|
|
|
uniqueX.sort((a, b) => {
|
|
return a - b
|
|
});
|
|
|
|
return uniqueX;
|
|
}
|
|
|
|
calculateUniqueFonts(tocLinks) {
|
|
var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
|
|
if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
|
|
return uniquesArray;
|
|
}, []);
|
|
|
|
return uniqueFont;
|
|
}
|
|
|
|
}
|
|
|
|
/**
 * One entry of a table of contents: the entry's text item, the page number
 * it points to, and its nesting level (always initialised to 0).
 */
class TocLink {

    /**
     * @param options object providing `textItem` and `pageNumber`
     */
    constructor(options) {
        const { textItem, pageNumber } = options;
        this.textItem = textItem;
        this.pageNumber = pageNumber;
        // The level is not taken from the options; it always starts at 0.
        this.level = 0;
    }

}
|