mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
[WIP] find not found TOC-Headers by size
This commit is contained in:
parent
93f15a38b5
commit
dbd9d8bf5f
@ -62,3 +62,11 @@ export function isListItem(string) {
|
|||||||
/**
 * Tells whether a line of text looks like an entry of a numbered list, e.g. "1. Introduction".
 *
 * A match consists of optional leading whitespace, one or more digits, a dot,
 * one whitespace character and then the remainder of the (single) line.
 *
 * @param string the text of the line to inspect
 * @returns true if the text starts with a "<number>. " marker, false otherwise
 */
export function isNumberedListItem(string) {
    // At least one digit is required: a bare ". text" carries no number and is not a numbered item.
    // No `g` flag: the pattern is used for a single test() call, so a sticky lastIndex is never wanted.
    return /^\s*\d+\.\s.*$/.test(string);
}
|
||||||
|
|
||||||
|
/**
 * Measures how many words two strings have in common, ignoring case and word order.
 *
 * Both strings are upper-cased and split into sets of distinct words. The score is
 * the number of shared words divided by the size of the larger word set, so it is
 * 1 for identical word sets and 0 if no word is shared.
 *
 * @param string1 first text
 * @param string2 second text
 * @returns a ratio between 0 and 1; 0 if either text contains no word at all
 */
export function wordMatch(string1, string2) {
    // Split on any run of whitespace and drop empty tokens, so repeated, leading or trailing
    // blanks (common in text extracted from PDFs) do not turn into a bogus "empty word".
    const toWords = text => new Set(text.toUpperCase().split(/\s+/).filter(word => word.length > 0));
    const words1 = toWords(string1);
    const words2 = toWords(string2);

    const largerSize = Math.max(words1.size, words2.size);
    if (largerSize === 0) {
        // Nothing to compare; also avoids 0 / 0 = NaN.
        return 0;
    }
    const sharedWords = [...words1].filter(word => words2.has(word));
    return sharedWords.length / largerSize;
}
|
||||||
|
@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiv
|
|||||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||||
// import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||||
|
|
||||||
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
||||||
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
||||||
|
@ -69,6 +69,10 @@ ElementType.initEnum({
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
export function isHeadline(elementType: ElementType) {
|
||||||
|
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
|
||||||
|
}
|
||||||
|
|
||||||
export function blockToText(block: TextItemBlock) {
|
export function blockToText(block: TextItemBlock) {
|
||||||
if (!block.type) {
|
if (!block.type) {
|
||||||
return concatTextItems(block.textItems);
|
return concatTextItems(block.textItems);
|
||||||
|
@ -1,28 +0,0 @@
|
|||||||
|
|
||||||
// Input is a flow of heights which are potential headers, output are header level for each height or the judgement, that it is no header
|
|
||||||
// Levels are from 1..6, where 1 is the biggest headline.
|
|
||||||
// HeaderLevelAssigner is use with an start level. If the start level is 2, then the first headline will be of level 2 and there will be no level 1 given.
|
|
||||||
export default class HeaderLevelAssigner {
|
|
||||||
|
|
||||||
constructor(options) {
|
|
||||||
this.startLevel = options.startLevel;
|
|
||||||
this.paragraphHeight = options.paragraphHeight;
|
|
||||||
this.lastLevel = null;
|
|
||||||
this.lastHeight = null;
|
|
||||||
this.heightToLevel = {};
|
|
||||||
}
|
|
||||||
|
|
||||||
add(height) {
|
|
||||||
if (!this.lastHeight) {
|
|
||||||
this.lastLevel = this.startLevel;
|
|
||||||
this.heightToLevel[height] = this.startLevel;
|
|
||||||
} else {
|
|
||||||
const existingLevel = this.heightToLevel[height];
|
|
||||||
if (!existingLevel) {
|
|
||||||
//
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this.lastHeight = height;
|
|
||||||
}
|
|
||||||
}
|
|
@ -0,0 +1,86 @@
|
|||||||
|
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||||
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
|
import TextItem from '../../TextItem.jsx';
|
||||||
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
|
import ElementType from '../../ElementType.jsx';
|
||||||
|
import { isHeadline, headlineByLevel } from '../../ElementType.jsx';
|
||||||
|
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
||||||
|
|
||||||
|
//Detect items starting with -, •, etc...
|
||||||
|
export default class DetectListItems extends ToTextItemTransformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Detect Headers");
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(parseResult:ParseResult) {
|
||||||
|
// analyse existing headers from TOC detection
|
||||||
|
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
page.items.forEach(textItem => {
|
||||||
|
if (isHeadline(textItem.type)) {
|
||||||
|
var range = headlineTypeToHeightRange[textItem.type];
|
||||||
|
if (range) {
|
||||||
|
range.min = Math.min(range.min, textItem.height);
|
||||||
|
range.max = Math.max(range.max, textItem.height);
|
||||||
|
} else {
|
||||||
|
range = {
|
||||||
|
min: textItem.height,
|
||||||
|
max: textItem.height
|
||||||
|
};
|
||||||
|
headlineTypeToHeightRange[textItem.type] = range;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||||
|
if (existingHeadlineTypes.length > 0) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
var foundListItems = 0;
|
||||||
|
var foundNumberedItems = 0;
|
||||||
|
// parseResult.pages.forEach(page => {
|
||||||
|
// const newTextItems = [];
|
||||||
|
// page.items.forEach(textItem => {
|
||||||
|
// newTextItems.push(textItem);
|
||||||
|
// if (!textItem.type) {
|
||||||
|
// var text = textItem.text;
|
||||||
|
// if (isListItem(text)) {
|
||||||
|
// foundListItems++
|
||||||
|
// const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
||||||
|
// if (textWithDash === text) {
|
||||||
|
// textItem.annotation = DETECTED_ANNOTATION;
|
||||||
|
// textItem.type = ElementType.LIST;
|
||||||
|
// } else {
|
||||||
|
// textItem.annotation = REMOVED_ANNOTATION;
|
||||||
|
// newTextItems.push(new TextItem({
|
||||||
|
// ...textItem,
|
||||||
|
// text: textWithDash,
|
||||||
|
// annotation: ADDED_ANNOTATION,
|
||||||
|
// type: ElementType.LIST
|
||||||
|
// }));
|
||||||
|
// }
|
||||||
|
// } else if (isNumberedListItem(text)) {
|
||||||
|
// foundNumberedItems++;
|
||||||
|
// textItem.annotation = DETECTED_ANNOTATION;
|
||||||
|
// textItem.type = ElementType.LIST;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// });
|
||||||
|
// page.items = newTextItems;
|
||||||
|
// });
|
||||||
|
|
||||||
|
return new ParseResult({
|
||||||
|
...parseResult,
|
||||||
|
messages: [
|
||||||
|
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
|
||||||
|
'Detected ' + foundNumberedItems + ' numbered list items.'
|
||||||
|
]
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -2,10 +2,10 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
|||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import TextItem from '../../TextItem.jsx';
|
import TextItem from '../../TextItem.jsx';
|
||||||
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { headlineByLevel } from '../../ElementType.jsx';
|
import { headlineByLevel } from '../../ElementType.jsx';
|
||||||
import { isDigit } from '../../../functions.jsx'
|
import { isDigit, wordMatch } from '../../../functions.jsx'
|
||||||
|
|
||||||
//Detect table of contents pages
|
//Detect table of contents pages
|
||||||
export default class DetectTOC extends ToTextItemTransformation {
|
export default class DetectTOC extends ToTextItemTransformation {
|
||||||
@ -99,16 +99,29 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
//all pages have been processed
|
//all pages have been processed
|
||||||
var foundHeadlines = tocLinks.length;
|
var foundHeadlines = tocLinks.length;
|
||||||
const notFoundHeadlines = [];
|
const notFoundHeadlines = [];
|
||||||
|
const foundBySize = [];
|
||||||
|
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
|
||||||
|
|
||||||
if (tocPages.length > 0) {
|
if (tocPages.length > 0) {
|
||||||
|
// Add TOC items
|
||||||
|
tocLinks.forEach(tocLink => {
|
||||||
|
lastTocPage.items.push(new TextItem({
|
||||||
|
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
||||||
|
type: ElementType.TOC,
|
||||||
|
annotation: ADDED_ANNOTATION
|
||||||
|
}));
|
||||||
|
});
|
||||||
|
|
||||||
|
// Add linked headers
|
||||||
tocLinks.forEach(tocLink => {
|
tocLinks.forEach(tocLink => {
|
||||||
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
||||||
var foundHeadline = false;
|
var foundHeadline = false;
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHeadline = findHeadline(linkedPage, tocLink);
|
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
|
||||||
if (!foundHeadline) { // pages are off by 1 ?
|
if (!foundHeadline) { // pages are off by 1 ?
|
||||||
linkedPage = parseResult.pages[tocLink.pageNumber];
|
linkedPage = parseResult.pages[tocLink.pageNumber];
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHeadline = findHeadline(linkedPage, tocLink);
|
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -118,22 +131,53 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
notFoundHeadlines.push(tocLink);
|
notFoundHeadlines.push(tocLink);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
tocLinks.forEach(tocLink => {
|
|
||||||
lastTocPage.items.push(new TextItem({
|
// Try to find linked headers by height
|
||||||
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
var fromPage = lastTocPage.index + 2;
|
||||||
type: ElementType.TOC,
|
var lastNotFound = [];
|
||||||
annotation: ADDED_ANNOTATION
|
const rollupLastNotFound = (currentPageNumber) => {
|
||||||
}));
|
if (lastNotFound.length > 0) {
|
||||||
});
|
lastNotFound.forEach(notFoundTocLink => {
|
||||||
|
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
||||||
|
const heightRange = headlineTypeToHeightRange[headlineType];
|
||||||
|
if (heightRange) {
|
||||||
|
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
||||||
|
if (textItem) {
|
||||||
|
textItem.type = headlineType;
|
||||||
|
textItem.annotation = DETECTED_ANNOTATION;
|
||||||
|
foundBySize.push(textItem.text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
lastNotFound = [];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (notFoundHeadlines.length > 0) {
|
||||||
|
tocLinks.forEach(tocLink => {
|
||||||
|
if (notFoundHeadlines.includes(tocLink)) {
|
||||||
|
lastNotFound.push(tocLink);
|
||||||
|
} else {
|
||||||
|
rollupLastNotFound(tocLink.pageNumber);
|
||||||
|
fromPage = tocLink.pageNumber;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (lastNotFound.length > 0) {
|
||||||
|
rollupLastNotFound(parseResult.pages.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
const messages = [];
|
const messages = [];
|
||||||
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
||||||
if (foundHeadlines > 0) {
|
if (tocPages.length > 0) {
|
||||||
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
|
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
|
||||||
|
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
|
||||||
}
|
}
|
||||||
if (notFoundHeadlines.length > 0) {
|
if (notFoundHeadlines.length > 0) {
|
||||||
messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
|
messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
|
||||||
|
messages.push('Found TOC headlines (by size): ' + foundBySize);
|
||||||
}
|
}
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
@ -148,7 +192,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function findHeadline(page, tocLink) {
|
function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
|
||||||
const headline = tocLink.textItem.text;
|
const headline = tocLink.textItem.text;
|
||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: headline
|
headline: headline
|
||||||
@ -158,12 +202,26 @@ function findHeadline(page, tocLink) {
|
|||||||
const headlineItems = headlineFinder.consume(line);
|
const headlineItems = headlineFinder.consume(line);
|
||||||
if (headlineItems) {
|
if (headlineItems) {
|
||||||
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
||||||
|
const headlineType = headlineByLevel(tocLink.level + 2);
|
||||||
|
const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
|
||||||
page.items.splice(lineIndex + 1, 0, new TextItem({
|
page.items.splice(lineIndex + 1, 0, new TextItem({
|
||||||
...headlineItems[0],
|
...headlineItems[0],
|
||||||
text: headline,
|
text: headline,
|
||||||
type: headlineByLevel(tocLink.level + 2),
|
height: headlineHeight,
|
||||||
|
type: headlineType,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
|
var range = headlineTypeToHeightRange[headlineType];
|
||||||
|
if (range) {
|
||||||
|
range.min = Math.min(range.min, headlineHeight);
|
||||||
|
range.max = Math.max(range.max, headlineHeight);
|
||||||
|
} else {
|
||||||
|
range = {
|
||||||
|
min: headlineHeight,
|
||||||
|
max: headlineHeight
|
||||||
|
};
|
||||||
|
headlineTypeToHeightRange[headlineType] = range;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
lineIndex++;
|
lineIndex++;
|
||||||
@ -171,6 +229,20 @@ function findHeadline(page, tocLink) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
|
||||||
|
for (var i = fromPage; i <= toPage; i++) {
|
||||||
|
const page = pages[i - 1];
|
||||||
|
for ( var line of page.items ) {
|
||||||
|
if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
|
||||||
|
const match = wordMatch(tocLink.textItem.text, line.text);
|
||||||
|
if (match >= 0.5) {
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class LinkLeveler {
|
class LinkLeveler {
|
||||||
constructor() {
|
constructor() {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import { expect } from 'chai';
|
import { expect } from 'chai';
|
||||||
|
|
||||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'
|
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
||||||
|
|
||||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||||
|
|
||||||
@ -135,3 +135,20 @@ describe('isNumberedListItem', () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Specification of wordMatch(): ratio of shared words, relative to the bigger word set.
describe('wordMatch', () => {

    it('scores identical word sets with 1', () => {
        expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
    });

    it('scores partial overlaps relative to the bigger word set', () => {
        expect(wordMatch('text 1', 'text 2')).to.equal(0.5);
        expect(wordMatch('text 1', 'text 1 2')).to.equal(2 / 3);
        expect(wordMatch('text 1 2 3', 'text 1 4 5')).to.equal(0.5);
        expect(wordMatch('text 1 2 3', 'text')).to.equal(0.25);
    });

    it('ignores the order of words', () => {
        expect(wordMatch('text 1 2 3', '5 1 4 text')).to.equal(0.5);
    });

    it('does not match similar but different words', () => {
        expect(wordMatch('text', 'test')).to.equal(0.0);
    });

    it('ignores case', () => {
        expect(wordMatch('inStruCtionS for the full Moon proCeSS', 'Instructions for the Full Moon Process')).to.equal(1.0);
    });

});
|
||||||
|
Loading…
Reference in New Issue
Block a user