mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-26 04:21:40 +02:00
[WIP] detect more headlines with already detected heights
This commit is contained in:
parent
a9b851ceb6
commit
1eda51c0b4
@ -14,70 +14,37 @@ export default class DetectListItems extends ToTextItemTransformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
// analyse existing headers from TOC detection
|
const {tocPages, headlineTypeToHeightRange, mostUsedHeight} = parseResult.globals;
|
||||||
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
|
|
||||||
|
var detectedHeaders = 0;
|
||||||
|
|
||||||
|
if (tocPages.length > 0) {
|
||||||
|
|
||||||
|
//apply existing headline heights to find additional headlines
|
||||||
|
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||||
|
headlineTypes.forEach(headlineType => {
|
||||||
|
var range = headlineTypeToHeightRange[headlineType];
|
||||||
|
// if (range.min > mostUsedHeight && range.max - range.min <= 1) { //use only very clear headlines
|
||||||
|
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(textItem => {
|
||||||
if (isHeadline(textItem.type)) {
|
if (!textItem.type && textItem.height == range.max) {
|
||||||
var range = headlineTypeToHeightRange[textItem.type];
|
textItem.annotation = DETECTED_ANNOTATION;
|
||||||
if (range) {
|
textItem.type = ElementType.enumValueOf(headlineType);
|
||||||
range.min = Math.min(range.min, textItem.height);
|
detectedHeaders++
|
||||||
range.max = Math.max(range.max, textItem.height);
|
|
||||||
} else {
|
|
||||||
range = {
|
|
||||||
min: textItem.height,
|
|
||||||
max: textItem.height
|
|
||||||
};
|
|
||||||
headlineTypeToHeightRange[textItem.type] = range;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange);
|
|
||||||
if (existingHeadlineTypes.length > 0) {
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
});
|
||||||
var foundListItems = 0;
|
}
|
||||||
var foundNumberedItems = 0;
|
|
||||||
// parseResult.pages.forEach(page => {
|
|
||||||
// const newTextItems = [];
|
|
||||||
// page.items.forEach(textItem => {
|
|
||||||
// newTextItems.push(textItem);
|
|
||||||
// if (!textItem.type) {
|
|
||||||
// var text = textItem.text;
|
|
||||||
// if (isListItem(text)) {
|
|
||||||
// foundListItems++
|
|
||||||
// const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
|
||||||
// if (textWithDash === text) {
|
|
||||||
// textItem.annotation = DETECTED_ANNOTATION;
|
|
||||||
// textItem.type = ElementType.LIST;
|
|
||||||
// } else {
|
|
||||||
// textItem.annotation = REMOVED_ANNOTATION;
|
|
||||||
// newTextItems.push(new TextItem({
|
|
||||||
// ...textItem,
|
|
||||||
// text: textWithDash,
|
|
||||||
// annotation: ADDED_ANNOTATION,
|
|
||||||
// type: ElementType.LIST
|
|
||||||
// }));
|
|
||||||
// }
|
|
||||||
// } else if (isNumberedListItem(text)) {
|
|
||||||
// foundNumberedItems++;
|
|
||||||
// textItem.annotation = DETECTED_ANNOTATION;
|
|
||||||
// textItem.type = ElementType.LIST;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// });
|
|
||||||
// page.items = newTextItems;
|
|
||||||
// });
|
|
||||||
|
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
messages: [
|
messages: [
|
||||||
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
|
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
|
||||||
'Detected ' + foundNumberedItems + ' numbered list items.'
|
'Detected ' + detectedHeaders + ' headlines.'
|
||||||
]
|
]
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -140,7 +140,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
if (lastNotFound.length > 0) {
|
if (lastNotFound.length > 0) {
|
||||||
lastNotFound.forEach(notFoundTocLink => {
|
lastNotFound.forEach(notFoundTocLink => {
|
||||||
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
||||||
const heightRange = headlineTypeToHeightRange[headlineType];
|
const heightRange = headlineTypeToHeightRange[headlineType.name];
|
||||||
if (heightRange) {
|
if (heightRange) {
|
||||||
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
||||||
if (textItem) {
|
if (textItem) {
|
||||||
@ -184,8 +184,8 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
...parseResult,
|
...parseResult,
|
||||||
globals: {
|
globals: {
|
||||||
...parseResult.globals,
|
...parseResult.globals,
|
||||||
tocPages: tocPages
|
tocPages: tocPages,
|
||||||
|
headlineTypeToHeightRange: headlineTypeToHeightRange
|
||||||
},
|
},
|
||||||
messages: messages
|
messages: messages
|
||||||
});
|
});
|
||||||
@ -242,7 +242,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
|||||||
type: headlineType,
|
type: headlineType,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
var range = headlineTypeToHeightRange[headlineType];
|
var range = headlineTypeToHeightRange[headlineType.name];
|
||||||
if (range) {
|
if (range) {
|
||||||
range.min = Math.min(range.min, headlineHeight);
|
range.min = Math.min(range.min, headlineHeight);
|
||||||
range.max = Math.max(range.max, headlineHeight);
|
range.max = Math.max(range.max, headlineHeight);
|
||||||
@ -251,7 +251,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
|||||||
min: headlineHeight,
|
min: headlineHeight,
|
||||||
max: headlineHeight
|
max: headlineHeight
|
||||||
};
|
};
|
||||||
headlineTypeToHeightRange[headlineType] = range;
|
headlineTypeToHeightRange[headlineType.name] = range;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user