[WIP] find not found TOC-Headers by size

This commit is contained in:
Johannes Zillmann 2017-03-15 08:42:46 +01:00
parent 93f15a38b5
commit dbd9d8bf5f
7 changed files with 205 additions and 46 deletions

View File

@ -62,3 +62,11 @@ export function isListItem(string) {
export function isNumberedListItem(string) { export function isNumberedListItem(string) {
return /^[\s]*[\d]*[\.][\s].*$/g.test(string); return /^[\s]*[\d]*[\.][\s].*$/g.test(string);
} }
/**
 * Scores how many words two strings have in common, ignoring case.
 *
 * Both strings are upper-cased and split on whitespace into sets of distinct words.
 * The score is the number of words contained in both sets divided by the size of the
 * larger set: 1 means the word sets are identical, 0 means no word is shared.
 * Repeated, leading or trailing whitespace never produces an empty "word", so
 * differently spaced but otherwise equal texts still score 1.
 *
 * @param string1 first text
 * @param string2 second text
 * @returns ratio between 0 and 1; 0 when neither text contains a word
 */
export function wordMatch(string1, string2) {
    const toWordSet = text => new Set(text.toUpperCase().split(/\s+/).filter(word => word.length > 0));
    const words1 = toWordSet(string1);
    const words2 = toWordSet(string2);
    const largerSize = Math.max(words1.size, words2.size);
    if (largerSize === 0) {
        // Nothing to compare; avoid 0 / 0 (NaN).
        return 0;
    }
    const sharedWords = [...words1].filter(word => words2.has(word));
    return sharedWords.length / largerSize;
}

View File

@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiv
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx'
// import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'

View File

@ -69,6 +69,10 @@ ElementType.initEnum({
} }
}); });
/**
 * Tells whether an element type is a headline type.
 *
 * A type counts as a headline when its name is exactly two characters long and
 * starts with 'H' (for example 'H1'). The result is always a real boolean, also
 * when no type is given.
 *
 * @param elementType the type to check; may be null or undefined
 * @returns true for headline types, false otherwise
 */
export function isHeadline(elementType: ElementType) {
    if (!elementType) {
        return false;
    }
    const name = elementType.name;
    return name.length === 2 && name[0] === 'H';
}
export function blockToText(block: TextItemBlock) { export function blockToText(block: TextItemBlock) {
if (!block.type) { if (!block.type) {
return concatTextItems(block.textItems); return concatTextItems(block.textItems);

View File

@ -1,28 +0,0 @@
// Takes a stream of heights that are potential headers and is meant to yield, for each
// height, either a header level or the verdict that it is no header.
// Levels range from 1 to 6, with 1 being the biggest headline.
// The assigner is used with a start level: with a start level of 2 the first headline
// gets level 2 and level 1 is never handed out.
export default class HeaderLevelAssigner {

    // options.startLevel: level given to the first height that is added
    // options.paragraphHeight: stored, but not read anywhere in this class
    constructor(options) {
        const { startLevel, paragraphHeight } = options;
        this.startLevel = startLevel;
        this.paragraphHeight = paragraphHeight;
        this.lastLevel = null;
        this.lastHeight = null;
        this.heightToLevel = {};
    }

    // Feeds the next height. Only the first height (no previous height remembered yet)
    // is mapped to a level, namely the start level; later heights are merely remembered
    // as the last height, their level assignment is not implemented.
    add(height) {
        if (this.lastHeight) {
            this.lastHeight = height;
            return;
        }
        this.lastLevel = this.startLevel;
        this.heightToLevel[height] = this.startLevel;
        this.lastHeight = height;
    }

}

View File

@ -0,0 +1,86 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
import { isHeadline, headlineByLevel } from '../../ElementType.jsx';
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
// Transformation registered under the name "Detect Headers". Work in progress: so far it
// only measures, per headline type, the range of heights of the items that already carry
// that type (e.g. headlines assigned by the TOC detection) and reports it as a message.
// NOTE(review): the class name DetectListItems and the "numbered list items" message look
// like leftovers from the list item transformation this file was started from - confirm.
export default class DetectListItems extends ToTextItemTransformation {

    constructor() {
        super("Detect Headers");
    }

    // Returns a new ParseResult carrying the fields of the given one, with the messages
    // replaced by the measured headline heights and the (currently always 0) item count.
    transform(parseResult:ParseResult) {
        // Height range per headline type, e.g. H1={min:23, max:25}
        const headlineTypeToHeightRange = {};
        for (const page of parseResult.pages) {
            for (const textItem of page.items) {
                if (!isHeadline(textItem.type)) {
                    continue;
                }
                const knownRange = headlineTypeToHeightRange[textItem.type];
                if (knownRange) {
                    // widen the range seen so far for this headline type
                    knownRange.min = Math.min(knownRange.min, textItem.height);
                    knownRange.max = Math.max(knownRange.max, textItem.height);
                } else {
                    headlineTypeToHeightRange[textItem.type] = {
                        min: textItem.height,
                        max: textItem.height
                    };
                }
            }
        }

        // TODO: use the measured ranges to type further, not yet typed items as headlines.
        // Nothing is detected yet, so the counter reported below never changes.
        const foundNumberedItems = 0;

        const messages = [
            'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
            'Detected ' + foundNumberedItems + ' numbered list items.'
        ];
        return new ParseResult({
            ...parseResult,
            messages
        });
    }

}

View File

@ -2,10 +2,10 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx'; import TextItem from '../../TextItem.jsx';
import HeadlineFinder from '../../HeadlineFinder.jsx'; import HeadlineFinder from '../../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx';
import { isDigit } from '../../../functions.jsx' import { isDigit, wordMatch } from '../../../functions.jsx'
//Detect table of contents pages //Detect table of contents pages
export default class DetectTOC extends ToTextItemTransformation { export default class DetectTOC extends ToTextItemTransformation {
@ -99,16 +99,29 @@ export default class DetectTOC extends ToTextItemTransformation {
//all pages have been processed //all pages have been processed
var foundHeadlines = tocLinks.length; var foundHeadlines = tocLinks.length;
const notFoundHeadlines = []; const notFoundHeadlines = [];
const foundBySize = [];
const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
if (tocPages.length > 0) { if (tocPages.length > 0) {
// Add TOC items
tocLinks.forEach(tocLink => {
lastTocPage.items.push(new TextItem({
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
type: ElementType.TOC,
annotation: ADDED_ANNOTATION
}));
});
// Add linked headers
tocLinks.forEach(tocLink => { tocLinks.forEach(tocLink => {
var linkedPage = parseResult.pages[tocLink.pageNumber - 1]; var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
var foundHeadline = false; var foundHeadline = false;
if (linkedPage) { if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink); foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
if (!foundHeadline) { // pages are off by 1 ? if (!foundHeadline) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber]; linkedPage = parseResult.pages[tocLink.pageNumber];
if (linkedPage) { if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink); foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
} }
} }
} else { } else {
@ -118,22 +131,53 @@ export default class DetectTOC extends ToTextItemTransformation {
notFoundHeadlines.push(tocLink); notFoundHeadlines.push(tocLink);
} }
}); });
tocLinks.forEach(tocLink => {
lastTocPage.items.push(new TextItem({ // Try to find linked headers by height
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text, var fromPage = lastTocPage.index + 2;
type: ElementType.TOC, var lastNotFound = [];
annotation: ADDED_ANNOTATION const rollupLastNotFound = (currentPageNumber) => {
})); if (lastNotFound.length > 0) {
}); lastNotFound.forEach(notFoundTocLink => {
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
const heightRange = headlineTypeToHeightRange[headlineType];
if (heightRange) {
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
if (textItem) {
textItem.type = headlineType;
textItem.annotation = DETECTED_ANNOTATION;
foundBySize.push(textItem.text);
}
}
});
lastNotFound = [];
}
}
if (notFoundHeadlines.length > 0) {
tocLinks.forEach(tocLink => {
if (notFoundHeadlines.includes(tocLink)) {
lastNotFound.push(tocLink);
} else {
rollupLastNotFound(tocLink.pageNumber);
fromPage = tocLink.pageNumber;
}
});
if (lastNotFound.length > 0) {
rollupLastNotFound(parseResult.pages.length);
}
}
} }
const messages = []; const messages = [];
messages.push('Detected ' + tocPages.length + ' table of content pages'); messages.push('Detected ' + tocPages.length + ' table of content pages');
if (foundHeadlines > 0) { if (tocPages.length > 0) {
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines); messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
} }
if (notFoundHeadlines.length > 0) { if (notFoundHeadlines.length > 0) {
messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber)); messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
messages.push('Found TOC headlines (by size): ' + foundBySize);
} }
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
@ -148,7 +192,7 @@ export default class DetectTOC extends ToTextItemTransformation {
} }
function findHeadline(page, tocLink) { function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
const headline = tocLink.textItem.text; const headline = tocLink.textItem.text;
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: headline headline: headline
@ -158,12 +202,26 @@ function findHeadline(page, tocLink) {
const headlineItems = headlineFinder.consume(line); const headlineItems = headlineFinder.consume(line);
if (headlineItems) { if (headlineItems) {
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION); headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
const headlineType = headlineByLevel(tocLink.level + 2);
const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
page.items.splice(lineIndex + 1, 0, new TextItem({ page.items.splice(lineIndex + 1, 0, new TextItem({
...headlineItems[0], ...headlineItems[0],
text: headline, text: headline,
type: headlineByLevel(tocLink.level + 2), height: headlineHeight,
type: headlineType,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION
})); }));
var range = headlineTypeToHeightRange[headlineType];
if (range) {
range.min = Math.min(range.min, headlineHeight);
range.max = Math.max(range.max, headlineHeight);
} else {
range = {
min: headlineHeight,
max: headlineHeight
};
headlineTypeToHeightRange[headlineType] = range;
}
return true; return true;
} }
lineIndex++; lineIndex++;
@ -171,6 +229,20 @@ function findHeadline(page, tocLink) {
return false; return false;
} }
/**
 * Looks for a text item that is likely the headline of a TOC entry, judging by its
 * height and its wording.
 *
 * Pages fromPage..toPage are scanned in order. An item qualifies when it has neither a
 * type nor an annotation, its height lies within heightRange (bounds included) and at
 * least half of its words match the TOC entry's text according to wordMatch.
 * Page numbers outside the document are ignored instead of raising a TypeError.
 *
 * @param pages all pages; page number p is stored at pages[p - 1]
 * @param tocLink TOC entry; tocLink.textItem.text is the headline text searched for
 * @param heightRange accepted heights as {min, max}
 * @param fromPage first page number to scan
 * @param toPage last page number to scan
 * @returns the first qualifying item, or undefined when there is none
 */
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
    // Restrict the scan to pages that exist: callers derive the bounds from TOC page
    // numbers, which are not guaranteed to lie within the document.
    const firstPage = Math.max(fromPage, 1);
    const lastPage = Math.min(toPage, pages.length);
    for (let pageNumber = firstPage; pageNumber <= lastPage; pageNumber++) {
        const page = pages[pageNumber - 1];
        if (!page) {
            continue;
        }
        for (const line of page.items) {
            const isCandidate = !line.type && !line.annotation
                && line.height >= heightRange.min && line.height <= heightRange.max;
            if (isCandidate && wordMatch(tocLink.textItem.text, line.text) >= 0.5) {
                return line;
            }
        }
    }
    return undefined;
}
class LinkLeveler { class LinkLeveler {
constructor() { constructor() {

View File

@ -1,6 +1,6 @@
import { expect } from 'chai'; import { expect } from 'chai';
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx' import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
describe('hasUpperCaseCharacterInMiddleOfWord', () => { describe('hasUpperCaseCharacterInMiddleOfWord', () => {
@ -135,3 +135,20 @@ describe('isNumberedListItem', () => {
}); });
}); });
// Specification of wordMatch: the share of distinct words, compared case-insensitively,
// that two strings have in common, relative to the larger of the two word sets.
describe('wordMatch', () => {

    it('returns 1 for identical word sets', () => {
        expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
    });

    it('compares words case-insensitively', () => {
        expect(wordMatch('inStruCtionS for the full Moon proCeSS', 'Instructions for the Full Moon Process')).to.equal(1.0);
    });

    it('relates the shared words to the larger word set', () => {
        expect(wordMatch('text 1', 'text 2')).to.equal(0.5);
        // 2 of 3 words shared; compare with a tolerance instead of a hand-typed float
        expect(wordMatch('text 1', 'text 1 2')).to.be.closeTo(2 / 3, 1e-12);
        expect(wordMatch('text 1 2 3', 'text 1 4 5')).to.equal(0.5);
        expect(wordMatch('text 1 2 3', 'text')).to.equal(0.25);
    });

    it('ignores word order', () => {
        expect(wordMatch('text 1 2 3', '5 1 4 text')).to.equal(0.5);
    });

    it('returns 0 when no word is shared', () => {
        expect(wordMatch('text', 'test')).to.equal(0.0);
    });

});