[WIP] add headers for all Uppercase lines

This commit is contained in:
Johannes Zillmann 2017-03-20 07:10:43 +01:00
parent 07e7fbb505
commit a35ecd28b6
2 changed files with 40 additions and 4 deletions

View File

@@ -7,31 +7,43 @@ export default class ElementType extends Enum {
ElementType.initEnum({ ElementType.initEnum({
H1: { H1: {
headline: true,
headlineLevel: 1,
toText(block:TextItemBlock) { toText(block:TextItemBlock) {
return '# ' + concatTextItems(block.textItems); return '# ' + concatTextItems(block.textItems);
} }
}, },
H2: { H2: {
headline: true,
headlineLevel: 2,
toText(block:TextItemBlock) { toText(block:TextItemBlock) {
return '## ' + concatTextItems(block.textItems); return '## ' + concatTextItems(block.textItems);
} }
}, },
H3: { H3: {
headline: true,
headlineLevel: 3,
toText(block:TextItemBlock) { toText(block:TextItemBlock) {
return '### ' + concatTextItems(block.textItems); return '### ' + concatTextItems(block.textItems);
} }
}, },
H4: { H4: {
headline: true,
headlineLevel: 4,
toText(block:TextItemBlock) { toText(block:TextItemBlock) {
return '#### ' + concatTextItems(block.textItems); return '#### ' + concatTextItems(block.textItems);
} }
}, },
H5: { H5: {
headline: true,
headlineLevel: 5,
toText(block:TextItemBlock) { toText(block:TextItemBlock) {
return '##### ' + concatTextItems(block.textItems); return '##### ' + concatTextItems(block.textItems);
} }
}, },
H6: { H6: {
headline: true,
headlineLevel: 6,
toText(block:TextItemBlock) { toText(block:TextItemBlock) {
return '###### ' + concatTextItems(block.textItems); return '###### ' + concatTextItems(block.textItems);
} }

View File

@@ -12,7 +12,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals; const {tocPages, headlineTypeToHeightRange, mostUsedHeight, mostUsedDistance, mostUsedFont, maxHeight} = parseResult.globals;
const hasToc = tocPages.length > 0; const hasToc = tocPages.length > 0;
var detectedHeaders = 0; var detectedHeaders = 0;
@@ -34,8 +34,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
}); });
}); });
if (hasToc) { if (hasToc) { //Use existing headline heights to find additional headlines
//Use existing headline heights to find additional headlines
const headlineTypes = Object.keys(headlineTypeToHeightRange); const headlineTypes = Object.keys(headlineTypeToHeightRange);
headlineTypes.forEach(headlineType => { headlineTypes.forEach(headlineType => {
var range = headlineTypeToHeightRange[headlineType]; var range = headlineTypeToHeightRange[headlineType];
@@ -52,7 +51,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
} }
}); });
} else { } else { //Categorize headlines by the text heights
const heights = []; const heights = [];
var lastHeight; var lastHeight;
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
@@ -80,6 +79,31 @@ export default class DetectHeaders extends ToTextItemTransformation {
}); });
} }
//find headlines which have paragraph height
var smallesHeadlineLevel = 1;
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (textItem.type && textItem.type.headline) {
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel);
}
});
});
if (smallesHeadlineLevel < 6) {
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type
&& textItem.height == mostUsedHeight
&& textItem.font !== mostUsedFont
&& textItem.text === textItem.text.toUpperCase()
) {
detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = nextHeadlineType;
}
});
});
}
return new ParseResult({ return new ParseResult({