mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 15:23:26 +01:00
[WIP] robustify TOC headline finding
This commit is contained in:
parent
dbd9d8bf5f
commit
a9b851ceb6
@ -113,21 +113,22 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Add linked headers
|
// Add linked headers
|
||||||
|
const pageMapping = detectPageMappingNumber(parseResult.pages.filter(page => page.index > lastTocPage.index), tocLinks);
|
||||||
tocLinks.forEach(tocLink => {
|
tocLinks.forEach(tocLink => {
|
||||||
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
|
||||||
var foundHeadline = false;
|
var foundHealineItems;
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
|
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
|
||||||
if (!foundHeadline) { // pages are off by 1 ?
|
if (!foundHealineItems) { // pages are off by 1 ?
|
||||||
linkedPage = parseResult.pages[tocLink.pageNumber];
|
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
|
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
//TODO sometimes pages are off. We could try the page range from pre to next ...
|
|
||||||
}
|
}
|
||||||
if (!foundHeadline) {
|
if (foundHealineItems) {
|
||||||
|
addHeadlineItems(linkedPage, tocLink, foundHealineItems, headlineTypeToHeightRange)
|
||||||
|
} else {
|
||||||
notFoundHeadlines.push(tocLink);
|
notFoundHeadlines.push(tocLink);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -192,8 +193,27 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
|
//Find out how the TOC page link actualy translates to the page.index
|
||||||
const headline = tocLink.textItem.text;
|
function detectPageMappingNumber(pages, tocLinks) {
|
||||||
|
for ( var tocLink of tocLinks ) {
|
||||||
|
const page = findPageWithHeadline(pages, tocLink.textItem.text);
|
||||||
|
if (page) {
|
||||||
|
return page.index - tocLink.pageNumber;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function findPageWithHeadline(pages, headline) {
|
||||||
|
for ( var page of pages ) {
|
||||||
|
if (findHeadlineItems(page, headline)) {
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function findHeadlineItems(page, headline) {
|
||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: headline
|
headline: headline
|
||||||
});
|
});
|
||||||
@ -201,32 +221,38 @@ function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
|
|||||||
for ( var line of page.items ) {
|
for ( var line of page.items ) {
|
||||||
const headlineItems = headlineFinder.consume(line);
|
const headlineItems = headlineFinder.consume(line);
|
||||||
if (headlineItems) {
|
if (headlineItems) {
|
||||||
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
return {
|
||||||
const headlineType = headlineByLevel(tocLink.level + 2);
|
lineIndex: lineIndex,
|
||||||
const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
|
headlineItems: headlineItems
|
||||||
page.items.splice(lineIndex + 1, 0, new TextItem({
|
};
|
||||||
...headlineItems[0],
|
|
||||||
text: headline,
|
|
||||||
height: headlineHeight,
|
|
||||||
type: headlineType,
|
|
||||||
annotation: ADDED_ANNOTATION
|
|
||||||
}));
|
|
||||||
var range = headlineTypeToHeightRange[headlineType];
|
|
||||||
if (range) {
|
|
||||||
range.min = Math.min(range.min, headlineHeight);
|
|
||||||
range.max = Math.max(range.max, headlineHeight);
|
|
||||||
} else {
|
|
||||||
range = {
|
|
||||||
min: headlineHeight,
|
|
||||||
max: headlineHeight
|
|
||||||
};
|
|
||||||
headlineTypeToHeightRange[headlineType] = range;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
lineIndex++;
|
lineIndex++;
|
||||||
}
|
}
|
||||||
return false;
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange) {
|
||||||
|
foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
||||||
|
const headlineType = headlineByLevel(tocLink.level + 2);
|
||||||
|
const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
|
||||||
|
page.items.splice(foundItems.lineIndex + 1, 0, new TextItem({
|
||||||
|
...foundItems.headlineItems[0],
|
||||||
|
text: tocLink.textItem.text,
|
||||||
|
height: headlineHeight,
|
||||||
|
type: headlineType,
|
||||||
|
annotation: ADDED_ANNOTATION
|
||||||
|
}));
|
||||||
|
var range = headlineTypeToHeightRange[headlineType];
|
||||||
|
if (range) {
|
||||||
|
range.min = Math.min(range.min, headlineHeight);
|
||||||
|
range.max = Math.max(range.max, headlineHeight);
|
||||||
|
} else {
|
||||||
|
range = {
|
||||||
|
min: headlineHeight,
|
||||||
|
max: headlineHeight
|
||||||
|
};
|
||||||
|
headlineTypeToHeightRange[headlineType] = range;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
|
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
|
||||||
|
Loading…
Reference in New Issue
Block a user