[WIP] add headers for all Uppercase lines

This commit is contained in:
Johannes Zillmann 2017-03-20 07:10:43 +01:00
parent 07e7fbb505
commit a35ecd28b6
2 changed files with 40 additions and 4 deletions

View File

@ -7,31 +7,43 @@ export default class ElementType extends Enum {
ElementType.initEnum({
H1: {
headline: true,
headlineLevel: 1,
toText(block:TextItemBlock) {
return '# ' + concatTextItems(block.textItems);
}
},
H2: {
headline: true,
headlineLevel: 2,
toText(block:TextItemBlock) {
return '## ' + concatTextItems(block.textItems);
}
},
H3: {
headline: true,
headlineLevel: 3,
toText(block:TextItemBlock) {
return '### ' + concatTextItems(block.textItems);
}
},
H4: {
headline: true,
headlineLevel: 4,
toText(block:TextItemBlock) {
return '#### ' + concatTextItems(block.textItems);
}
},
H5: {
headline: true,
headlineLevel: 5,
toText(block:TextItemBlock) {
return '##### ' + concatTextItems(block.textItems);
}
},
H6: {
headline: true,
headlineLevel: 6,
toText(block:TextItemBlock) {
return '###### ' + concatTextItems(block.textItems);
}

View File

@ -12,7 +12,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
}
transform(parseResult:ParseResult) {
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, mostUsedDistance, mostUsedFont, maxHeight} = parseResult.globals;
const hasToc = tocPages.length > 0;
var detectedHeaders = 0;
@ -34,8 +34,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
});
});
if (hasToc) {
//Use existing headline heights to find additional headlines
if (hasToc) { //Use existing headline heights to find additional headlines
const headlineTypes = Object.keys(headlineTypeToHeightRange);
headlineTypes.forEach(headlineType => {
var range = headlineTypeToHeightRange[headlineType];
@ -52,7 +51,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
}
});
} else {
} else { //Categorize headlines by the text heights
const heights = [];
var lastHeight;
parseResult.pages.forEach(page => {
@ -80,6 +79,31 @@ export default class DetectHeaders extends ToTextItemTransformation {
});
}
// Find additional headlines among lines that have the regular paragraph height:
// untyped items whose text is entirely upper-case and whose font differs from
// the most used font are promoted to a headline one level below the deepest
// headline detected so far.
//
// Step 1: determine the numerically highest headline level already assigned
// (H1 = 1 ... H6 = 6). Despite its name, this variable holds the *largest* level
// number, i.e. the visually smallest headline. It stays at 1 when no item
// carries a headline type.
var smallesHeadlineLevel = 1;
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (textItem.type && textItem.type.headline) {
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel);
}
});
});
// Step 2: only possible while there is still a deeper level available (max is 6).
// NOTE(review): when no headline was found above, the level is still 1, so the
// upper-case lines become level 2 — confirm that this is intended.
if (smallesHeadlineLevel < 6) {
// headlineByLevel is defined outside this view; presumably it maps a numeric
// level to the matching headline element type — TODO confirm.
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
// Candidate: no type assigned yet, same height as the most used height, a font
// other than the most used one, and text identical to its upper-cased form.
// NOTE(review): text without any cased letters (digits, punctuation, empty
// string) also equals its upper-cased form and would match — verify.
// NOTE(review): the height check uses loose equality (==), unlike the strict
// comparisons next to it — confirm whether that is deliberate.
if (!textItem.type
&& textItem.height == mostUsedHeight
&& textItem.font !== mostUsedFont
&& textItem.text === textItem.text.toUpperCase()
) {
// Count the hit and mark the item as a detected headline of the next level.
detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = nextHeadlineType;
}
});
});
}
return new ParseResult({