diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx index 9bda341..c855d15 100644 --- a/src/javascript/models/ElementType.jsx +++ b/src/javascript/models/ElementType.jsx @@ -7,31 +7,43 @@ export default class ElementType extends Enum { ElementType.initEnum({ H1: { + headline: true, + headlineLevel: 1, toText(block:TextItemBlock) { return '# ' + concatTextItems(block.textItems); } }, H2: { + headline: true, + headlineLevel: 2, toText(block:TextItemBlock) { return '## ' + concatTextItems(block.textItems); } }, H3: { + headline: true, + headlineLevel: 3, toText(block:TextItemBlock) { return '### ' + concatTextItems(block.textItems); } }, H4: { + headline: true, + headlineLevel: 4, toText(block:TextItemBlock) { return '#### ' + concatTextItems(block.textItems); } }, H5: { + headline: true, + headlineLevel: 5, toText(block:TextItemBlock) { return '##### ' + concatTextItems(block.textItems); } }, H6: { + headline: true, + headlineLevel: 6, toText(block:TextItemBlock) { return '###### ' + concatTextItems(block.textItems); } diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx index 6c226e3..3c6fa56 100644 --- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx +++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx @@ -12,7 +12,7 @@ export default class DetectHeaders extends ToTextItemTransformation { } transform(parseResult:ParseResult) { - const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals; + const {tocPages, headlineTypeToHeightRange, mostUsedHeight, mostUsedDistance, mostUsedFont, maxHeight} = parseResult.globals; const hasToc = tocPages.length > 0; var detectedHeaders = 0; @@ -34,8 +34,7 @@ export default class DetectHeaders extends ToTextItemTransformation { }); }); - if (hasToc) { - //Use existing headline heights to find additional headlines + if (hasToc) { //Use existing headline heights to find additional headlines const headlineTypes = Object.keys(headlineTypeToHeightRange); headlineTypes.forEach(headlineType => { var range = headlineTypeToHeightRange[headlineType]; @@ -52,7 +51,7 @@ export default class DetectHeaders extends ToTextItemTransformation { } }); - } else { + } else { //Categorize headlines by the text heights const heights = []; var lastHeight; parseResult.pages.forEach(page => { @@ -80,6 +79,31 @@ export default class DetectHeaders extends ToTextItemTransformation { }); } + //find headlines which have paragraph height + var smallesHeadlineLevel = 1; + parseResult.pages.forEach(page => { + page.items.forEach(textItem => { + if (textItem.type && textItem.type.headline) { + smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel); + } + }); + }); + if (smallesHeadlineLevel < 6) { + const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1); + parseResult.pages.forEach(page => { + page.items.forEach(textItem => { + if (!textItem.type + && textItem.height == mostUsedHeight + && textItem.font !== mostUsedFont + && textItem.text === textItem.text.toUpperCase() + ) { + detectedHeaders++; + textItem.annotation = DETECTED_ANNOTATION; + textItem.type = nextHeadlineType; + } + }); + }); + } return new ParseResult({