[WIP] Headline detection for PDFs without a TOC

This commit is contained in:
Johannes Zillmann 2017-03-16 07:38:36 +01:00
parent 77576ebd7e
commit 4600dc6ee7
2 changed files with 53 additions and 40 deletions

View File

@ -123,17 +123,6 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
} }
/**
 * Collects the pages that contain at least one untyped block whose first
 * text item has exactly the given height.
 *
 * @param pages pages to scan; each page exposes its blocks through `items`
 * @param maxHeight height the first text item of a block is compared against
 * @returns a Set holding every matching page object (empty when none match)
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    const maxHeaderPagesSet = new Set();
    pages.forEach(page => {
        // A single matching block qualifies the page, so stop at the first hit.
        const hasMaxHeightBlock = page.items.some(block => {
            if (block.type) {
                return false; // blocks that already carry a type are ignored
            }
            // A block with no text items cannot match and must not throw.
            const firstItem = block.textItems && block.textItems[0];
            return !!firstItem && firstItem.height === maxHeight;
        });
        if (hasMaxHeightBlock) {
            maxHeaderPagesSet.add(page);
        }
    });
    return maxHeaderPagesSet;
}
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) { function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
// Find pages with max height // Find pages with max height
const maxHeaderPagesSet = new Set(); const maxHeaderPagesSet = new Set();

View File

@ -1,13 +1,11 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { isHeadline, headlineByLevel } from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx';
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
//Detect items starting with -, , etc... //Detect items starting with -, , etc...
export default class DetectListItems extends ToTextItemTransformation { export default class DetectHeaders extends ToTextItemTransformation {
constructor() { constructor() {
super("Detect Headers"); super("Detect Headers");
@ -15,30 +13,9 @@ export default class DetectListItems extends ToTextItemTransformation {
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals; const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
const hasToc = tocPages.length > 0;
var detectedHeaders = 0; var detectedHeaders = 0;
if (tocPages.length > 0) {
//Use existing headline heights to find additional headlines
const headlineTypes = Object.keys(headlineTypeToHeightRange);
headlineTypes.forEach(headlineType => {
var range = headlineTypeToHeightRange[headlineType];
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type && textItem.height == range.max) {
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.enumValueOf(headlineType);
detectedHeaders++
}
});
});
}
});
}
// Handle title pages // Handle title pages
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight); const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
@ -57,11 +34,58 @@ export default class DetectListItems extends ToTextItemTransformation {
}); });
}); });
if (hasToc) {
//Use existing headline heights to find additional headlines
const headlineTypes = Object.keys(headlineTypeToHeightRange);
headlineTypes.forEach(headlineType => {
var range = headlineTypeToHeightRange[headlineType];
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type && textItem.height == range.max) {
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.enumValueOf(headlineType);
detectedHeaders++
}
});
});
}
});
} else {
const heights = [];
var lastHeight;
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type && textItem.height > mostUsedHeight) {
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
heights.push(textItem.height);
}
}
});
});
heights.sort((a, b) => b - a);
heights.forEach((height, i) => {
const headlineType = headlineByLevel(2 + i);
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type && textItem.height == height) {
detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = headlineType;
}
});
});
});
}
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
messages: [ messages: [
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange), 'Detected ' + detectedHeaders + ' headlines.',
'Detected ' + detectedHeaders + ' headlines.'
] ]
}); });