mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 15:23:26 +01:00
[WIP] Add headers for all-uppercase lines
This commit is contained in:
parent
07e7fbb505
commit
a35ecd28b6
@ -7,31 +7,43 @@ export default class ElementType extends Enum {
|
||||
|
||||
ElementType.initEnum({
|
||||
H1: {
|
||||
headline: true,
|
||||
headlineLevel: 1,
|
||||
toText(block:TextItemBlock) {
|
||||
return '# ' + concatTextItems(block.textItems);
|
||||
}
|
||||
},
|
||||
H2: {
|
||||
headline: true,
|
||||
headlineLevel: 2,
|
||||
toText(block:TextItemBlock) {
|
||||
return '## ' + concatTextItems(block.textItems);
|
||||
}
|
||||
},
|
||||
H3: {
|
||||
headline: true,
|
||||
headlineLevel: 3,
|
||||
toText(block:TextItemBlock) {
|
||||
return '### ' + concatTextItems(block.textItems);
|
||||
}
|
||||
},
|
||||
H4: {
|
||||
headline: true,
|
||||
headlineLevel: 4,
|
||||
toText(block:TextItemBlock) {
|
||||
return '#### ' + concatTextItems(block.textItems);
|
||||
}
|
||||
},
|
||||
H5: {
|
||||
headline: true,
|
||||
headlineLevel: 5,
|
||||
toText(block:TextItemBlock) {
|
||||
return '##### ' + concatTextItems(block.textItems);
|
||||
}
|
||||
},
|
||||
H6: {
|
||||
headline: true,
|
||||
headlineLevel: 6,
|
||||
toText(block:TextItemBlock) {
|
||||
return '###### ' + concatTextItems(block.textItems);
|
||||
}
|
||||
|
@ -12,7 +12,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
|
||||
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, mostUsedDistance, mostUsedFont, maxHeight} = parseResult.globals;
|
||||
const hasToc = tocPages.length > 0;
|
||||
var detectedHeaders = 0;
|
||||
|
||||
@ -34,8 +34,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
});
|
||||
});
|
||||
|
||||
if (hasToc) {
|
||||
//Use existing headline heights to find additional headlines
|
||||
if (hasToc) { //Use existing headline heights to find additional headlines
|
||||
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||
headlineTypes.forEach(headlineType => {
|
||||
var range = headlineTypeToHeightRange[headlineType];
|
||||
@ -52,7 +51,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
}
|
||||
|
||||
});
|
||||
} else {
|
||||
} else { //Categorize headlines by the text heights
|
||||
const heights = [];
|
||||
var lastHeight;
|
||||
parseResult.pages.forEach(page => {
|
||||
@ -80,6 +79,31 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
});
|
||||
}
|
||||
|
||||
//Find headlines that have the same height as paragraph text: untyped items in a font other than the most used one whose text is all uppercase
|
||||
var smallesHeadlineLevel = 1;
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (textItem.type && textItem.type.headline) {
|
||||
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel);
|
||||
}
|
||||
});
|
||||
});
|
||||
if (smallesHeadlineLevel < 6) {
|
||||
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type
|
||||
&& textItem.height == mostUsedHeight
|
||||
&& textItem.font !== mostUsedFont
|
||||
&& textItem.text === textItem.text.toUpperCase()
|
||||
) {
|
||||
detectedHeaders++;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = nextHeadlineType;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
return new ParseResult({
|
||||
|
Loading…
Reference in New Issue
Block a user