[WIP] Detect more headlines using already-detected headline heights

This commit is contained in:
Johannes Zillmann 2017-03-16 06:52:45 +01:00
parent a9b851ceb6
commit 1eda51c0b4
2 changed files with 28 additions and 61 deletions

View File

@ -14,70 +14,37 @@ export default class DetectListItems extends ToTextItemTransformation {
}
transform(parseResult:ParseResult) {
// analyse existing headers from TOC detection
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (isHeadline(textItem.type)) {
var range = headlineTypeToHeightRange[textItem.type];
if (range) {
range.min = Math.min(range.min, textItem.height);
range.max = Math.max(range.max, textItem.height);
} else {
range = {
min: textItem.height,
max: textItem.height
};
headlineTypeToHeightRange[textItem.type] = range;
}
const {tocPages, headlineTypeToHeightRange, mostUsedHeight} = parseResult.globals;
var detectedHeaders = 0;
if (tocPages.length > 0) {
//apply existing headline heights to find additional headlines
const headlineTypes = Object.keys(headlineTypeToHeightRange);
headlineTypes.forEach(headlineType => {
var range = headlineTypeToHeightRange[headlineType];
// if (range.min > mostUsedHeight && range.max - range.min <= 1) { //use only very clear headlines
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type && textItem.height == range.max) {
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.enumValueOf(headlineType);
detectedHeaders++
}
});
});
}
});
});
const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange);
if (existingHeadlineTypes.length > 0) {
}
var foundListItems = 0;
var foundNumberedItems = 0;
// parseResult.pages.forEach(page => {
// const newTextItems = [];
// page.items.forEach(textItem => {
// newTextItems.push(textItem);
// if (!textItem.type) {
// var text = textItem.text;
// if (isListItem(text)) {
// foundListItems++
// const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
// if (textWithDash === text) {
// textItem.annotation = DETECTED_ANNOTATION;
// textItem.type = ElementType.LIST;
// } else {
// textItem.annotation = REMOVED_ANNOTATION;
// newTextItems.push(new TextItem({
// ...textItem,
// text: textWithDash,
// annotation: ADDED_ANNOTATION,
// type: ElementType.LIST
// }));
// }
// } else if (isNumberedListItem(text)) {
// foundNumberedItems++;
// textItem.annotation = DETECTED_ANNOTATION;
// textItem.type = ElementType.LIST;
// }
// }
// });
// page.items = newTextItems;
// });
return new ParseResult({
...parseResult,
messages: [
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
'Detected ' + foundNumberedItems + ' numbered list items.'
'Detected ' + detectedHeaders + ' headlines.'
]
});

View File

@ -140,7 +140,7 @@ export default class DetectTOC extends ToTextItemTransformation {
if (lastNotFound.length > 0) {
lastNotFound.forEach(notFoundTocLink => {
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
const heightRange = headlineTypeToHeightRange[headlineType];
const heightRange = headlineTypeToHeightRange[headlineType.name];
if (heightRange) {
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
if (textItem) {
@ -184,8 +184,8 @@ export default class DetectTOC extends ToTextItemTransformation {
...parseResult,
globals: {
...parseResult.globals,
tocPages: tocPages
tocPages: tocPages,
headlineTypeToHeightRange: headlineTypeToHeightRange
},
messages: messages
});
@ -242,7 +242,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
type: headlineType,
annotation: ADDED_ANNOTATION
}));
var range = headlineTypeToHeightRange[headlineType];
var range = headlineTypeToHeightRange[headlineType.name];
if (range) {
range.min = Math.min(range.min, headlineHeight);
range.max = Math.max(range.max, headlineHeight);
@ -251,7 +251,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
min: headlineHeight,
max: headlineHeight
};
headlineTypeToHeightRange[headlineType] = range;
headlineTypeToHeightRange[headlineType.name] = range;
}
}