mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-24 00:33:48 +01:00
[WIP] detect more headlines with already detected heights
This commit is contained in:
parent
a9b851ceb6
commit
1eda51c0b4
@ -14,70 +14,37 @@ export default class DetectListItems extends ToTextItemTransformation {
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
// analyse existing headers from TOC detection
|
||||
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (isHeadline(textItem.type)) {
|
||||
var range = headlineTypeToHeightRange[textItem.type];
|
||||
if (range) {
|
||||
range.min = Math.min(range.min, textItem.height);
|
||||
range.max = Math.max(range.max, textItem.height);
|
||||
} else {
|
||||
range = {
|
||||
min: textItem.height,
|
||||
max: textItem.height
|
||||
};
|
||||
headlineTypeToHeightRange[textItem.type] = range;
|
||||
}
|
||||
const {tocPages, headlineTypeToHeightRange, mostUsedHeight} = parseResult.globals;
|
||||
|
||||
var detectedHeaders = 0;
|
||||
|
||||
if (tocPages.length > 0) {
|
||||
|
||||
//apply existing headline heights to find additional headlines
|
||||
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||
headlineTypes.forEach(headlineType => {
|
||||
var range = headlineTypeToHeightRange[headlineType];
|
||||
// if (range.min > mostUsedHeight && range.max - range.min <= 1) { //use only very clear headlines
|
||||
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == range.max) {
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.enumValueOf(headlineType);
|
||||
detectedHeaders++
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||
if (existingHeadlineTypes.length > 0) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
var foundListItems = 0;
|
||||
var foundNumberedItems = 0;
|
||||
// parseResult.pages.forEach(page => {
|
||||
// const newTextItems = [];
|
||||
// page.items.forEach(textItem => {
|
||||
// newTextItems.push(textItem);
|
||||
// if (!textItem.type) {
|
||||
// var text = textItem.text;
|
||||
// if (isListItem(text)) {
|
||||
// foundListItems++
|
||||
// const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
||||
// if (textWithDash === text) {
|
||||
// textItem.annotation = DETECTED_ANNOTATION;
|
||||
// textItem.type = ElementType.LIST;
|
||||
// } else {
|
||||
// textItem.annotation = REMOVED_ANNOTATION;
|
||||
// newTextItems.push(new TextItem({
|
||||
// ...textItem,
|
||||
// text: textWithDash,
|
||||
// annotation: ADDED_ANNOTATION,
|
||||
// type: ElementType.LIST
|
||||
// }));
|
||||
// }
|
||||
// } else if (isNumberedListItem(text)) {
|
||||
// foundNumberedItems++;
|
||||
// textItem.annotation = DETECTED_ANNOTATION;
|
||||
// textItem.type = ElementType.LIST;
|
||||
// }
|
||||
// }
|
||||
// });
|
||||
// page.items = newTextItems;
|
||||
// });
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
|
||||
'Detected ' + foundNumberedItems + ' numbered list items.'
|
||||
'Detected ' + detectedHeaders + ' headlines.'
|
||||
]
|
||||
});
|
||||
|
||||
|
@ -140,7 +140,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
if (lastNotFound.length > 0) {
|
||||
lastNotFound.forEach(notFoundTocLink => {
|
||||
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
||||
const heightRange = headlineTypeToHeightRange[headlineType];
|
||||
const heightRange = headlineTypeToHeightRange[headlineType.name];
|
||||
if (heightRange) {
|
||||
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
||||
if (textItem) {
|
||||
@ -184,8 +184,8 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
...parseResult,
|
||||
globals: {
|
||||
...parseResult.globals,
|
||||
tocPages: tocPages
|
||||
|
||||
tocPages: tocPages,
|
||||
headlineTypeToHeightRange: headlineTypeToHeightRange
|
||||
},
|
||||
messages: messages
|
||||
});
|
||||
@ -242,7 +242,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
||||
type: headlineType,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
var range = headlineTypeToHeightRange[headlineType];
|
||||
var range = headlineTypeToHeightRange[headlineType.name];
|
||||
if (range) {
|
||||
range.min = Math.min(range.min, headlineHeight);
|
||||
range.max = Math.max(range.max, headlineHeight);
|
||||
@ -251,7 +251,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
||||
min: headlineHeight,
|
||||
max: headlineHeight
|
||||
};
|
||||
headlineTypeToHeightRange[headlineType] = range;
|
||||
headlineTypeToHeightRange[headlineType.name] = range;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user