mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-07-13 12:36:23 +02:00
360 lines
13 KiB
JavaScript
360 lines
13 KiB
JavaScript
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
|
import ParseResult from '../../ParseResult.jsx';
|
|
import LineItem from '../../LineItem.jsx';
|
|
import Word from '../../Word.jsx';
|
|
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
|
import BlockType from '../../markdown/BlockType.jsx';
|
|
import { headlineByLevel } from '../../markdown/BlockType.jsx';
|
|
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx'
|
|
|
|
//Detect table of contents pages plus linked headlines
|
|
export default class DetectTOC extends ToLineItemTransformation {
|
|
|
|
constructor() {
|
|
super("Detect TOC");
|
|
}
|
|
|
|
transform(parseResult:ParseResult) {
|
|
const tocPages = [];
|
|
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
|
const linkLeveler = new LinkLeveler();
|
|
|
|
|
|
var tocLinks = [];
|
|
var lastTocPage;
|
|
var headlineItem;
|
|
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
|
var lineItemsWithDigits = 0;
|
|
const unknownLines = new Set();
|
|
const pageTocLinks = [];
|
|
var lastWordsWithoutNumber;
|
|
var lastLine;
|
|
//find lines ending with a number per page
|
|
page.items.forEach(line => {
|
|
var words = line.words.filter(word => !hasOnly(word.string, '.'));
|
|
const digits = [];
|
|
while (words.length > 0 && isNumber(words[words.length - 1].string)) {
|
|
const lastWord = words.pop();
|
|
digits.unshift(lastWord.string);
|
|
}
|
|
|
|
if (digits.length == 0 && words.length > 0) {
|
|
const lastWord = words[words.length - 1];
|
|
while (isDigit(lastWord.string.charCodeAt(lastWord.string.length - 1))) {
|
|
digits.unshift(lastWord.string.charAt(lastWord.string.length - 1))
|
|
lastWord.string = lastWord.string.substring(0, lastWord.string.length - 1);
|
|
}
|
|
}
|
|
var endsWithDigit = digits.length > 0;
|
|
if (endsWithDigit) {
|
|
endsWithDigit = true;
|
|
if (lastWordsWithoutNumber) { // 2-line item ?
|
|
words.push(...lastWordsWithoutNumber);
|
|
lastWordsWithoutNumber = null;
|
|
}
|
|
pageTocLinks.push(new TocLink({
|
|
pageNumber: parseInt(digits.join('')),
|
|
lineItem: new LineItem({
|
|
...line,
|
|
words: words
|
|
})
|
|
}));
|
|
lineItemsWithDigits++;
|
|
} else {
|
|
if (!headlineItem) {
|
|
headlineItem = line;
|
|
} else {
|
|
if (lastWordsWithoutNumber) {
|
|
unknownLines.add(lastLine);
|
|
}
|
|
lastWordsWithoutNumber = words;
|
|
lastLine = line;
|
|
}
|
|
}
|
|
});
|
|
|
|
// page has been processed
|
|
if (lineItemsWithDigits * 100 / page.items.length > 75) {
|
|
tocPages.push(page.index + 1);
|
|
lastTocPage = page;
|
|
linkLeveler.levelPageItems(pageTocLinks);
|
|
tocLinks.push(...pageTocLinks);
|
|
|
|
const newBlocks = [];
|
|
page.items.forEach((line) => {
|
|
if (!unknownLines.has(line)) {
|
|
line.annotation = REMOVED_ANNOTATION;
|
|
}
|
|
newBlocks.push(line);
|
|
if (line === headlineItem) {
|
|
newBlocks.push(new LineItem({
|
|
...line,
|
|
type: BlockType.H2,
|
|
annotation: ADDED_ANNOTATION
|
|
}));
|
|
}
|
|
});
|
|
page.items = newBlocks;
|
|
} else {
|
|
headlineItem = null;
|
|
}
|
|
});
|
|
|
|
//all pages have been processed
|
|
var foundHeadlines = tocLinks.length;
|
|
const notFoundHeadlines = [];
|
|
const foundBySize = [];
|
|
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
|
|
|
|
if (tocPages.length > 0) {
|
|
// Add TOC items
|
|
tocLinks.forEach(tocLink => {
|
|
lastTocPage.items.push(new LineItem({
|
|
words: [new Word({
|
|
string: ' '.repeat(tocLink.level * 3) + '-'
|
|
})].concat(tocLink.lineItem.words),
|
|
type: BlockType.TOC,
|
|
annotation: ADDED_ANNOTATION
|
|
}));
|
|
});
|
|
|
|
// Add linked headers
|
|
const pageMapping = detectPageMappingNumber(parseResult.pages.filter(page => page.index > lastTocPage.index), tocLinks);
|
|
tocLinks.forEach(tocLink => {
|
|
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
|
|
var foundHealineItems;
|
|
if (linkedPage) {
|
|
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
|
|
if (!foundHealineItems) { // pages are off by 1 ?
|
|
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
|
|
if (linkedPage) {
|
|
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
|
|
}
|
|
}
|
|
}
|
|
if (foundHealineItems) {
|
|
addHeadlineItems(linkedPage, tocLink, foundHealineItems, headlineTypeToHeightRange)
|
|
} else {
|
|
notFoundHeadlines.push(tocLink);
|
|
}
|
|
});
|
|
|
|
// Try to find linked headers by height
|
|
var fromPage = lastTocPage.index + 2;
|
|
var lastNotFound = [];
|
|
const rollupLastNotFound = (currentPageNumber) => {
|
|
if (lastNotFound.length > 0) {
|
|
lastNotFound.forEach(notFoundTocLink => {
|
|
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
|
const heightRange = headlineTypeToHeightRange[headlineType.name];
|
|
if (heightRange) {
|
|
const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
|
if (lineIndex > -1) {
|
|
const page = parseResult.pages[pageIndex];
|
|
page.items[lineIndex].annotation = REMOVED_ANNOTATION;
|
|
page.items.splice(lineIndex + 1, 0, new LineItem({
|
|
...notFoundTocLink.lineItem,
|
|
type: headlineType,
|
|
annotation: ADDED_ANNOTATION,
|
|
}));
|
|
foundBySize.push(notFoundTocLink);
|
|
}
|
|
}
|
|
});
|
|
lastNotFound = [];
|
|
}
|
|
}
|
|
if (notFoundHeadlines.length > 0) {
|
|
tocLinks.forEach(tocLink => {
|
|
if (notFoundHeadlines.includes(tocLink)) {
|
|
lastNotFound.push(tocLink);
|
|
} else {
|
|
rollupLastNotFound(tocLink.pageNumber);
|
|
fromPage = tocLink.pageNumber;
|
|
}
|
|
});
|
|
if (lastNotFound.length > 0) {
|
|
rollupLastNotFound(parseResult.pages.length);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
const messages = [];
|
|
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
|
if (tocPages.length > 0) {
|
|
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
|
|
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
|
|
}
|
|
if (notFoundHeadlines.length > 0) {
|
|
messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text()));
|
|
messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber));
|
|
}
|
|
return new ParseResult({
|
|
...parseResult,
|
|
globals: {
|
|
...parseResult.globals,
|
|
tocPages: tocPages,
|
|
headlineTypeToHeightRange: headlineTypeToHeightRange
|
|
},
|
|
messages: messages
|
|
});
|
|
}
|
|
|
|
}
|
|
|
|
/**
 * Finds out how a page number printed in the TOC actually translates to page.index.
 *
 * Walks the TOC links in order and uses the first one whose headline text can be
 * located on one of the given pages.
 *
 * @param pages pages to search for the headlines
 * @param tocLinks TOC links providing headline text and printed page number
 * @returns the offset (page.index minus printed page number) of the first
 *          locatable link, or null when none of the headlines was found
 */
function detectPageMappingNumber(pages, tocLinks) {
    let offset = null;
    tocLinks.some(link => {
        const hit = findPageWithHeadline(pages, link.lineItem.text());
        if (!hit) {
            return false;
        }
        offset = hit.index - link.pageNumber;
        return true; // stop at the first locatable headline
    });
    return offset;
}
|
|
|
|
/**
 * Returns the first of the given pages on which the headline text can be found.
 *
 * @param pages pages to search, in order
 * @param headline headline text to look for
 * @returns the first matching page, or null if no page contains the headline
 */
function findPageWithHeadline(pages, headline) {
    const match = pages.find(candidate => findHeadlineItems(candidate, headline));
    return match || null;
}
|
|
|
|
/**
 * Feeds the lines of a page, in order, into a HeadlineFinder until it reports
 * the items that make up the headline.
 *
 * @param page page whose items are searched
 * @param headline headline text to look for
 * @returns `{ lineIndex, headlineItems }`, where lineIndex is the index of the
 *          line at which the finder reported the match, or null if the page
 *          does not contain the headline
 */
function findHeadlineItems(page, headline) {
    const finder = new HeadlineFinder({
        headline: headline
    });
    for (const [position, candidate] of page.items.entries()) {
        const matchedItems = finder.consume(candidate);
        if (matchedItems) {
            return {
                lineIndex: position,
                headlineItems: matchedItems
            };
        }
    }
    return null;
}
|
|
|
|
/**
 * Replaces the located headline lines of a page by a single headline item and
 * records the headline's height for its headline type.
 *
 * @param page page containing the located headline; its items are modified in place
 * @param tocLink TOC link providing the headline's words and level
 * @param foundItems result of findHeadlineItems ({ lineIndex, headlineItems })
 * @param headlineTypeToHeightRange map of headline type name to { min, max }
 *        height; widened (or initialised) with this headline's height
 */
function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange) {
    const matchedLines = foundItems.headlineItems;
    for (const matchedLine of matchedLines) {
        matchedLine.annotation = REMOVED_ANNOTATION;
    }

    // TOC level 0 maps to headline level 2.
    const headlineType = headlineByLevel(tocLink.level + 2);
    // Tallest of the matched lines, never below 0.
    const headlineHeight = Math.max(0, ...matchedLines.map(matchedLine => matchedLine.height));

    const replacement = new LineItem({
        ...matchedLines[0],
        words: tocLink.lineItem.words,
        height: headlineHeight,
        type: headlineType,
        annotation: ADDED_ANNOTATION
    });
    page.items.splice(foundItems.lineIndex + 1, 0, replacement);

    const knownRange = headlineTypeToHeightRange[headlineType.name];
    if (!knownRange) {
        headlineTypeToHeightRange[headlineType.name] = {
            min: headlineHeight,
            max: headlineHeight
        };
        return;
    }
    knownRange.min = Math.min(knownRange.min, headlineHeight);
    knownRange.max = Math.max(knownRange.max, headlineHeight);
}
|
|
|
|
/**
 * Searches a range of pages for an untyped, unannotated line whose height lies
 * within the given range and whose text matches the TOC link's text.
 *
 * @param pages all pages of the document
 * @param tocLink TOC link whose (upper-cased) text is compared via wordMatch
 * @param heightRange `{ min, max }` height a candidate line must fall into (inclusive)
 * @param fromPage first page to search, 1-based
 * @param toPage last page to search, 1-based and inclusive
 * @returns `[pageIndex, lineIndex]` (both 0-based) of the first match, or
 *          `[-1, -1]` if nothing matched
 */
function findPageAndLineFromHeadline(pages, tocLink, heightRange, fromPage, toPage) {
    const linkText = tocLink.lineItem.text().toUpperCase();
    // The requested range may reach outside the document (e.g. when it is derived
    // from page numbers printed in the TOC), so clamp it instead of dereferencing
    // a non-existing page.
    const firstPage = Math.max(1, fromPage);
    const lastPage = Math.min(pages.length, toPage);
    for (let pageNumber = firstPage; pageNumber <= lastPage; pageNumber++) {
        const page = pages[pageNumber - 1];
        const lineIndex = page.items.findIndex(line => {
            if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
                // At least half of the words have to match.
                return wordMatch(linkText, line.text()) >= 0.5;
            }
            return false;
        });
        if (lineIndex > -1) return [pageNumber - 1, lineIndex];
    }
    return [-1, -1];
}
|
|
|
|
/**
 * Assigns a nesting level to TOC links.
 *
 * The levelling strategy is chosen once, based on the first batch of links
 * handed to levelPageItems(), and is then reused for all later batches:
 *  - more than one distinct x position => level by x position,
 *  - otherwise more than one distinct font => level by font,
 *  - otherwise every link gets level 0.
 */
class LinkLeveler {

    constructor() {
        // Strategy method picked by the first levelPageItems() call.
        this.levelByMethod = null;
        // Fonts in order of first appearance; the position is the level.
        this.uniqueFonts = [];
    }

    /**
     * Sets `level` on every given link.
     *
     * @param {TocLink[]} tocLinks the links of one TOC page
     */
    levelPageItems(tocLinks) {
        if (!this.levelByMethod) {
            const uniqueX = this.calculateUniqueX(tocLinks);
            if (uniqueX.length > 1) {
                this.levelByMethod = this.levelByXDiff;
            } else {
                const uniqueFonts = this.calculateUniqueFonts(tocLinks);
                if (uniqueFonts.length > 1) {
                    this.uniqueFonts = uniqueFonts;
                    this.levelByMethod = this.levelByFont;
                } else {
                    this.levelByMethod = this.levelToZero;
                }
            }
        }
        this.levelByMethod(tocLinks);
    }

    /** Level = rank of the link's x position among the distinct x positions of this batch. */
    levelByXDiff(tocLinks) {
        const uniqueX = this.calculateUniqueX(tocLinks);
        tocLinks.forEach(link => {
            link.level = uniqueX.indexOf(link.lineItem.x);
        });
    }

    /** Level = position of the link's font in the list of known fonts. */
    levelByFont(tocLinks) {
        tocLinks.forEach(link => {
            const font = link.lineItem.font;
            let level = this.uniqueFonts.indexOf(font);
            if (level < 0) {
                // A font that did not occur in the first batch: register it as the
                // next deeper level instead of handing out the invalid level -1.
                level = this.uniqueFonts.push(font) - 1;
            }
            link.level = level;
        });
    }

    /** Flat TOC: every link is on level 0. */
    levelToZero(tocLinks) {
        tocLinks.forEach(link => {
            link.level = 0;
        });
    }

    /** @returns the distinct x positions of the links, sorted ascending */
    calculateUniqueX(tocLinks) {
        const uniqueX = [...new Set(tocLinks.map(link => link.lineItem.x))];
        uniqueX.sort((a, b) => a - b);
        return uniqueX;
    }

    /** @returns the distinct fonts of the links, in order of first appearance */
    calculateUniqueFonts(tocLinks) {
        return [...new Set(tocLinks.map(link => link.lineItem.font))];
    }

}
|
|
|
|
/**
 * One entry of the table of contents: the entry's line item, the page number
 * it points to and its nesting level (0 until a leveler assigns one).
 */
class TocLink {

    /**
     * @param options `{ lineItem, pageNumber }`
     */
    constructor(options) {
        const { lineItem, pageNumber } = options;
        this.lineItem = lineItem;
        this.pageNumber = pageNumber;
        this.level = 0;
    }

}
|