[WIP] detect more headlines with already detected heights

This commit is contained in:
Johannes Zillmann 2017-03-16 06:52:45 +01:00
parent a9b851ceb6
commit 1eda51c0b4
2 changed files with 28 additions and 61 deletions

View File

@ -14,70 +14,37 @@ export default class DetectListItems extends ToTextItemTransformation {
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
// analyse existing headers from TOC detection const {tocPages, headlineTypeToHeightRange, mostUsedHeight} = parseResult.globals;
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
parseResult.pages.forEach(page => { var detectedHeaders = 0;
page.items.forEach(textItem => {
if (isHeadline(textItem.type)) { if (tocPages.length > 0) {
var range = headlineTypeToHeightRange[textItem.type];
if (range) { //apply existing headline heights to find additional headlines
range.min = Math.min(range.min, textItem.height); const headlineTypes = Object.keys(headlineTypeToHeightRange);
range.max = Math.max(range.max, textItem.height); headlineTypes.forEach(headlineType => {
} else { var range = headlineTypeToHeightRange[headlineType];
range = { // if (range.min > mostUsedHeight && range.max - range.min <= 1) { //use only very clear headlines
min: textItem.height, if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
max: textItem.height parseResult.pages.forEach(page => {
}; page.items.forEach(textItem => {
headlineTypeToHeightRange[textItem.type] = range; if (!textItem.type && textItem.height == range.max) {
} textItem.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.enumValueOf(headlineType);
detectedHeaders++
}
});
});
} }
}); });
});
const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange);
if (existingHeadlineTypes.length > 0) {
} }
var foundListItems = 0;
var foundNumberedItems = 0;
// parseResult.pages.forEach(page => {
// const newTextItems = [];
// page.items.forEach(textItem => {
// newTextItems.push(textItem);
// if (!textItem.type) {
// var text = textItem.text;
// if (isListItem(text)) {
// foundListItems++
// const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
// if (textWithDash === text) {
// textItem.annotation = DETECTED_ANNOTATION;
// textItem.type = ElementType.LIST;
// } else {
// textItem.annotation = REMOVED_ANNOTATION;
// newTextItems.push(new TextItem({
// ...textItem,
// text: textWithDash,
// annotation: ADDED_ANNOTATION,
// type: ElementType.LIST
// }));
// }
// } else if (isNumberedListItem(text)) {
// foundNumberedItems++;
// textItem.annotation = DETECTED_ANNOTATION;
// textItem.type = ElementType.LIST;
// }
// }
// });
// page.items = newTextItems;
// });
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
messages: [ messages: [
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange), 'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
'Detected ' + foundNumberedItems + ' numbered list items.' 'Detected ' + detectedHeaders + ' headlines.'
] ]
}); });

View File

@ -140,7 +140,7 @@ export default class DetectTOC extends ToTextItemTransformation {
if (lastNotFound.length > 0) { if (lastNotFound.length > 0) {
lastNotFound.forEach(notFoundTocLink => { lastNotFound.forEach(notFoundTocLink => {
const headlineType = headlineByLevel(notFoundTocLink.level + 2); const headlineType = headlineByLevel(notFoundTocLink.level + 2);
const heightRange = headlineTypeToHeightRange[headlineType]; const heightRange = headlineTypeToHeightRange[headlineType.name];
if (heightRange) { if (heightRange) {
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber); const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
if (textItem) { if (textItem) {
@ -184,8 +184,8 @@ export default class DetectTOC extends ToTextItemTransformation {
...parseResult, ...parseResult,
globals: { globals: {
...parseResult.globals, ...parseResult.globals,
tocPages: tocPages tocPages: tocPages,
headlineTypeToHeightRange: headlineTypeToHeightRange
}, },
messages: messages messages: messages
}); });
@ -242,7 +242,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
type: headlineType, type: headlineType,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION
})); }));
var range = headlineTypeToHeightRange[headlineType]; var range = headlineTypeToHeightRange[headlineType.name];
if (range) { if (range) {
range.min = Math.min(range.min, headlineHeight); range.min = Math.min(range.min, headlineHeight);
range.max = Math.max(range.max, headlineHeight); range.max = Math.max(range.max, headlineHeight);
@ -251,7 +251,7 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
min: headlineHeight, min: headlineHeight,
max: headlineHeight max: headlineHeight
}; };
headlineTypeToHeightRange[headlineType] = range; headlineTypeToHeightRange[headlineType.name] = range;
} }
} }