mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-01 03:09:01 +01:00
[WIP] find not found TOC-Headers by size
This commit is contained in:
parent
93f15a38b5
commit
dbd9d8bf5f
@ -62,3 +62,11 @@ export function isListItem(string) {
|
||||
/**
 * Tells whether a line of text starts like a numbered list item, e.g. "1. first point".
 *
 * A match consists of optional leading whitespace, at least one digit, a dot and a
 * whitespace character, followed by the rest of the line.
 *
 * @param string the text to check
 * @returns true if the text starts with a number marker, false otherwise
 */
export function isNumberedListItem(string) {
    // \d+ instead of \d*: a bare ". text" carries no number and must not count as a numbered item.
    // No 'g' flag: a single anchored test does not need global matching.
    return /^\s*\d+\.\s.*$/.test(string);
}
|
||||
|
||||
/**
 * Measures how many words two texts have in common, ignoring letter case and word order.
 *
 * Both texts are upper-cased and split at whitespace into sets of unique words. The result
 * is the number of words contained in both sets divided by the size of the larger set.
 *
 * @param string1 first text
 * @param string2 second text
 * @returns a ratio between 0 and 1; 0 if the texts share no word or contain no words at all
 */
export function wordMatch(string1, string2) {
    // Split at runs of any whitespace and drop empty tokens, so that repeated, leading or
    // trailing blanks do not produce an empty "word" which would distort the ratio.
    const toWordSet = text => new Set(text.toUpperCase().split(/\s+/).filter(word => word.length > 0));

    const words1 = toWordSet(string1);
    const words2 = toWordSet(string2);
    const largerSize = Math.max(words1.size, words2.size);
    if (largerSize === 0) {
        // Neither text contains a word: nothing can match (and avoid dividing by zero).
        return 0;
    }
    const sharedWords = [...words1].filter(word => words2.has(word));
    return sharedWords.length / largerSize;
}
|
||||
|
@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiv
|
||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||
// import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||
|
||||
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
||||
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
||||
|
@ -69,6 +69,10 @@ ElementType.initEnum({
|
||||
}
|
||||
});
|
||||
|
||||
export function isHeadline(elementType: ElementType) {
|
||||
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
|
||||
}
|
||||
|
||||
export function blockToText(block: TextItemBlock) {
|
||||
if (!block.type) {
|
||||
return concatTextItems(block.textItems);
|
||||
|
@ -1,28 +0,0 @@
|
||||
|
||||
// Input is a flow of heights which are potential headers, output are header level for each height or the judgement, that it is no header
|
||||
// Levels are from 1..6, where 1 is the biggest headline.
|
||||
// HeaderLevelAssigner is use with an start level. If the start level is 2, then the first headline will be of level 2 and there will be no level 1 given.
|
||||
export default class HeaderLevelAssigner {
|
||||
|
||||
constructor(options) {
|
||||
this.startLevel = options.startLevel;
|
||||
this.paragraphHeight = options.paragraphHeight;
|
||||
this.lastLevel = null;
|
||||
this.lastHeight = null;
|
||||
this.heightToLevel = {};
|
||||
}
|
||||
|
||||
add(height) {
|
||||
if (!this.lastHeight) {
|
||||
this.lastLevel = this.startLevel;
|
||||
this.heightToLevel[height] = this.startLevel;
|
||||
} else {
|
||||
const existingLevel = this.heightToLevel[height];
|
||||
if (!existingLevel) {
|
||||
//
|
||||
}
|
||||
}
|
||||
|
||||
this.lastHeight = height;
|
||||
}
|
||||
}
|
@ -0,0 +1,86 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { isHeadline, headlineByLevel } from '../../ElementType.jsx';
|
||||
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
||||
|
||||
//Detect items starting with -, •, etc...
|
||||
export default class DetectListItems extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Headers");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
// analyse existing headers from TOC detection
|
||||
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (isHeadline(textItem.type)) {
|
||||
var range = headlineTypeToHeightRange[textItem.type];
|
||||
if (range) {
|
||||
range.min = Math.min(range.min, textItem.height);
|
||||
range.max = Math.max(range.max, textItem.height);
|
||||
} else {
|
||||
range = {
|
||||
min: textItem.height,
|
||||
max: textItem.height
|
||||
};
|
||||
headlineTypeToHeightRange[textItem.type] = range;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||
if (existingHeadlineTypes.length > 0) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
var foundListItems = 0;
|
||||
var foundNumberedItems = 0;
|
||||
// parseResult.pages.forEach(page => {
|
||||
// const newTextItems = [];
|
||||
// page.items.forEach(textItem => {
|
||||
// newTextItems.push(textItem);
|
||||
// if (!textItem.type) {
|
||||
// var text = textItem.text;
|
||||
// if (isListItem(text)) {
|
||||
// foundListItems++
|
||||
// const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
||||
// if (textWithDash === text) {
|
||||
// textItem.annotation = DETECTED_ANNOTATION;
|
||||
// textItem.type = ElementType.LIST;
|
||||
// } else {
|
||||
// textItem.annotation = REMOVED_ANNOTATION;
|
||||
// newTextItems.push(new TextItem({
|
||||
// ...textItem,
|
||||
// text: textWithDash,
|
||||
// annotation: ADDED_ANNOTATION,
|
||||
// type: ElementType.LIST
|
||||
// }));
|
||||
// }
|
||||
// } else if (isNumberedListItem(text)) {
|
||||
// foundNumberedItems++;
|
||||
// textItem.annotation = DETECTED_ANNOTATION;
|
||||
// textItem.type = ElementType.LIST;
|
||||
// }
|
||||
// }
|
||||
// });
|
||||
// page.items = newTextItems;
|
||||
// });
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
|
||||
'Detected ' + foundNumberedItems + ' numbered list items.'
|
||||
]
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -2,10 +2,10 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { headlineByLevel } from '../../ElementType.jsx';
|
||||
import { isDigit } from '../../../functions.jsx'
|
||||
import { isDigit, wordMatch } from '../../../functions.jsx'
|
||||
|
||||
//Detect table of contents pages
|
||||
export default class DetectTOC extends ToTextItemTransformation {
|
||||
@ -99,16 +99,29 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
//all pages have been processed
|
||||
var foundHeadlines = tocLinks.length;
|
||||
const notFoundHeadlines = [];
|
||||
const foundBySize = [];
|
||||
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
|
||||
|
||||
if (tocPages.length > 0) {
|
||||
// Add TOC items
|
||||
tocLinks.forEach(tocLink => {
|
||||
lastTocPage.items.push(new TextItem({
|
||||
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
||||
type: ElementType.TOC,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
});
|
||||
|
||||
// Add linked headers
|
||||
tocLinks.forEach(tocLink => {
|
||||
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
||||
var foundHeadline = false;
|
||||
if (linkedPage) {
|
||||
foundHeadline = findHeadline(linkedPage, tocLink);
|
||||
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
|
||||
if (!foundHeadline) { // pages are off by 1 ?
|
||||
linkedPage = parseResult.pages[tocLink.pageNumber];
|
||||
if (linkedPage) {
|
||||
foundHeadline = findHeadline(linkedPage, tocLink);
|
||||
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -118,22 +131,53 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
notFoundHeadlines.push(tocLink);
|
||||
}
|
||||
});
|
||||
tocLinks.forEach(tocLink => {
|
||||
lastTocPage.items.push(new TextItem({
|
||||
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
||||
type: ElementType.TOC,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
});
|
||||
|
||||
// Try to find linked headers by height
|
||||
var fromPage = lastTocPage.index + 2;
|
||||
var lastNotFound = [];
|
||||
const rollupLastNotFound = (currentPageNumber) => {
|
||||
if (lastNotFound.length > 0) {
|
||||
lastNotFound.forEach(notFoundTocLink => {
|
||||
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
||||
const heightRange = headlineTypeToHeightRange[headlineType];
|
||||
if (heightRange) {
|
||||
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
||||
if (textItem) {
|
||||
textItem.type = headlineType;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
foundBySize.push(textItem.text);
|
||||
}
|
||||
}
|
||||
});
|
||||
lastNotFound = [];
|
||||
}
|
||||
}
|
||||
if (notFoundHeadlines.length > 0) {
|
||||
tocLinks.forEach(tocLink => {
|
||||
if (notFoundHeadlines.includes(tocLink)) {
|
||||
lastNotFound.push(tocLink);
|
||||
} else {
|
||||
rollupLastNotFound(tocLink.pageNumber);
|
||||
fromPage = tocLink.pageNumber;
|
||||
}
|
||||
});
|
||||
if (lastNotFound.length > 0) {
|
||||
rollupLastNotFound(parseResult.pages.length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
const messages = [];
|
||||
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
||||
if (foundHeadlines > 0) {
|
||||
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
|
||||
if (tocPages.length > 0) {
|
||||
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
|
||||
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
|
||||
}
|
||||
if (notFoundHeadlines.length > 0) {
|
||||
messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
|
||||
messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
|
||||
messages.push('Found TOC headlines (by size): ' + foundBySize);
|
||||
}
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
@ -148,7 +192,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
|
||||
}
|
||||
|
||||
function findHeadline(page, tocLink) {
|
||||
function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
|
||||
const headline = tocLink.textItem.text;
|
||||
const headlineFinder = new HeadlineFinder({
|
||||
headline: headline
|
||||
@ -158,12 +202,26 @@ function findHeadline(page, tocLink) {
|
||||
const headlineItems = headlineFinder.consume(line);
|
||||
if (headlineItems) {
|
||||
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
||||
const headlineType = headlineByLevel(tocLink.level + 2);
|
||||
const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
|
||||
page.items.splice(lineIndex + 1, 0, new TextItem({
|
||||
...headlineItems[0],
|
||||
text: headline,
|
||||
type: headlineByLevel(tocLink.level + 2),
|
||||
height: headlineHeight,
|
||||
type: headlineType,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
var range = headlineTypeToHeightRange[headlineType];
|
||||
if (range) {
|
||||
range.min = Math.min(range.min, headlineHeight);
|
||||
range.max = Math.max(range.max, headlineHeight);
|
||||
} else {
|
||||
range = {
|
||||
min: headlineHeight,
|
||||
max: headlineHeight
|
||||
};
|
||||
headlineTypeToHeightRange[headlineType] = range;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
lineIndex++;
|
||||
@ -171,6 +229,20 @@ function findHeadline(page, tocLink) {
|
||||
return false;
|
||||
}
|
||||
|
||||
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
|
||||
for (var i = fromPage; i <= toPage; i++) {
|
||||
const page = pages[i - 1];
|
||||
for ( var line of page.items ) {
|
||||
if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
|
||||
const match = wordMatch(tocLink.textItem.text, line.text);
|
||||
if (match >= 0.5) {
|
||||
return line;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class LinkLeveler {
|
||||
constructor() {
|
||||
|
@ -1,6 +1,6 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'
|
||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
||||
|
||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
|
||||
@ -135,3 +135,20 @@ describe('isNumberedListItem', () => {
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
// Suite is named after the function under test, like the other suites in this file.
describe('wordMatch', () => {

    it('Match', () => {
        // identical and partially overlapping word sets
        expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
        expect(wordMatch('text 1', 'text 2')).to.equal(0.5);
        expect(wordMatch('text 1', 'text 1 2')).to.equal(0.6666666666666666);
        expect(wordMatch('text 1 2 3', 'text 1 4 5')).to.equal(0.5);
        // word order does not matter
        expect(wordMatch('text 1 2 3', '5 1 4 text')).to.equal(0.5);
        expect(wordMatch('text 1 2 3', 'text')).to.equal(0.25);

        // only whole words count
        expect(wordMatch('text', 'test')).to.equal(0.0);

        // letter case is ignored
        expect(wordMatch('inStruCtionS for the full Moon proCeSS', 'Instructions for the Full Moon Process')).to.equal(1.0);
    });

});
|
||||
|
Loading…
Reference in New Issue
Block a user