/**
 * Computes a case-insensitive word-overlap ratio between two strings.
 *
 * Each string is upper-cased and broken into a set of distinct words. The
 * result is the number of words present in both sets divided by the size of
 * the larger set, so word order and repeated words do not affect the score.
 *
 * Words are separated by any run of whitespace (spaces, tabs, line breaks),
 * and empty tokens are discarded. Leading, trailing or doubled blanks
 * therefore do not count as "words" and cannot lower the score.
 *
 * @param {string} string1 first text; a missing value is treated as empty
 * @param {string} string2 second text; a missing value is treated as empty
 * @returns {number} ratio between 0 and 1; 0 when neither string contains a word
 */
export function wordMatch(string1, string2) {
    const toWordSet = text =>
        new Set((text || '').toUpperCase().split(/\s+/).filter(word => word.length > 0));

    const words1 = toWordSet(string1);
    const words2 = toWordSet(string2);

    const largerSize = Math.max(words1.size, words2.size);
    if (largerSize === 0) {
        // No words on either side: report "no match" instead of dividing by zero.
        return 0;
    }

    const sharedWords = [...words1].filter(word => words2.has(word));
    return sharedWords.length / largerSize;
}
--git a/src/javascript/models/HeaderLevelAssigner.jsx b/src/javascript/models/HeaderLevelAssigner.jsx deleted file mode 100644 index 81bffb8..0000000 --- a/src/javascript/models/HeaderLevelAssigner.jsx +++ /dev/null @@ -1,28 +0,0 @@ - -// Input is a flow of heights which are potential headers, output are header level for each height or the judgement, that it is no header -// Levels are from 1..6, where 1 is the biggest headline. -// HeaderLevelAssigner is use with an start level. If the start level is 2, then the first headline will be of level 2 and there will be no level 1 given. -export default class HeaderLevelAssigner { - - constructor(options) { - this.startLevel = options.startLevel; - this.paragraphHeight = options.paragraphHeight; - this.lastLevel = null; - this.lastHeight = null; - this.heightToLevel = {}; - } - - add(height) { - if (!this.lastHeight) { - this.lastLevel = this.startLevel; - this.heightToLevel[height] = this.startLevel; - } else { - const existingLevel = this.heightToLevel[height]; - if (!existingLevel) { - // - } - } - - this.lastHeight = height; - } -} \ No newline at end of file diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx new file mode 100644 index 0000000..da03562 --- /dev/null +++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx @@ -0,0 +1,86 @@ +import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ParseResult from '../../ParseResult.jsx'; +import TextItem from '../../TextItem.jsx'; +import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; +import ElementType from '../../ElementType.jsx'; +import { isHeadline, headlineByLevel } from '../../ElementType.jsx'; +import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx'; + +//Detect items starting with -, •, etc... 
+export default class DetectListItems extends ToTextItemTransformation { + + constructor() { + super("Detect Headers"); + } + + transform(parseResult:ParseResult) { + // analyse existing headers from TOC detection + const headlineTypeToHeightRange = {}; //H1={min:23, max:25} + parseResult.pages.forEach(page => { + page.items.forEach(textItem => { + if (isHeadline(textItem.type)) { + var range = headlineTypeToHeightRange[textItem.type]; + if (range) { + range.min = Math.min(range.min, textItem.height); + range.max = Math.max(range.max, textItem.height); + } else { + range = { + min: textItem.height, + max: textItem.height + }; + headlineTypeToHeightRange[textItem.type] = range; + } + } + }); + }); + + const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange); + if (existingHeadlineTypes.length > 0) { + + } + + + var foundListItems = 0; + var foundNumberedItems = 0; + // parseResult.pages.forEach(page => { + // const newTextItems = []; + // page.items.forEach(textItem => { + // newTextItems.push(textItem); + // if (!textItem.type) { + // var text = textItem.text; + // if (isListItem(text)) { + // foundListItems++ + // const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length); + // if (textWithDash === text) { + // textItem.annotation = DETECTED_ANNOTATION; + // textItem.type = ElementType.LIST; + // } else { + // textItem.annotation = REMOVED_ANNOTATION; + // newTextItems.push(new TextItem({ + // ...textItem, + // text: textWithDash, + // annotation: ADDED_ANNOTATION, + // type: ElementType.LIST + // })); + // } + // } else if (isNumberedListItem(text)) { + // foundNumberedItems++; + // textItem.annotation = DETECTED_ANNOTATION; + // textItem.type = ElementType.LIST; + // } + // } + // }); + // page.items = newTextItems; + // }); + + return new ParseResult({ + ...parseResult, + messages: [ + 'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange), + 'Detected ' + foundNumberedItems + ' numbered list items.' 
+ ] + }); + + } + +} diff --git a/src/javascript/models/transformations/textitem/DetectTOC.jsx b/src/javascript/models/transformations/textitem/DetectTOC.jsx index 140fc2a..9dfa836 100644 --- a/src/javascript/models/transformations/textitem/DetectTOC.jsx +++ b/src/javascript/models/transformations/textitem/DetectTOC.jsx @@ -2,10 +2,10 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; import TextItem from '../../TextItem.jsx'; import HeadlineFinder from '../../HeadlineFinder.jsx'; -import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; +import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx'; -import { isDigit } from '../../../functions.jsx' +import { isDigit, wordMatch } from '../../../functions.jsx' //Detect table of contents pages export default class DetectTOC extends ToTextItemTransformation { @@ -99,16 +99,29 @@ export default class DetectTOC extends ToTextItemTransformation { //all pages have been processed var foundHeadlines = tocLinks.length; const notFoundHeadlines = []; + const foundBySize = []; + const headlineTypeToHeightRange = {}; //H1={min:23, max:25} + if (tocPages.length > 0) { + // Add TOC items + tocLinks.forEach(tocLink => { + lastTocPage.items.push(new TextItem({ + text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text, + type: ElementType.TOC, + annotation: ADDED_ANNOTATION + })); + }); + + // Add linked headers tocLinks.forEach(tocLink => { var linkedPage = parseResult.pages[tocLink.pageNumber - 1]; var foundHeadline = false; if (linkedPage) { - foundHeadline = findHeadline(linkedPage, tocLink); + foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange); if (!foundHeadline) { // pages are off by 1 ? 
linkedPage = parseResult.pages[tocLink.pageNumber]; if (linkedPage) { - foundHeadline = findHeadline(linkedPage, tocLink); + foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange); } } } else { @@ -118,22 +131,53 @@ export default class DetectTOC extends ToTextItemTransformation { notFoundHeadlines.push(tocLink); } }); - tocLinks.forEach(tocLink => { - lastTocPage.items.push(new TextItem({ - text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text, - type: ElementType.TOC, - annotation: ADDED_ANNOTATION - })); - }); + + // Try to find linked headers by height + var fromPage = lastTocPage.index + 2; + var lastNotFound = []; + const rollupLastNotFound = (currentPageNumber) => { + if (lastNotFound.length > 0) { + lastNotFound.forEach(notFoundTocLink => { + const headlineType = headlineByLevel(notFoundTocLink.level + 2); + const heightRange = headlineTypeToHeightRange[headlineType]; + if (heightRange) { + const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber); + if (textItem) { + textItem.type = headlineType; + textItem.annotation = DETECTED_ANNOTATION; + foundBySize.push(textItem.text); + } + } + }); + lastNotFound = []; + } + } + if (notFoundHeadlines.length > 0) { + tocLinks.forEach(tocLink => { + if (notFoundHeadlines.includes(tocLink)) { + lastNotFound.push(tocLink); + } else { + rollupLastNotFound(tocLink.pageNumber); + fromPage = tocLink.pageNumber; + } + }); + if (lastNotFound.length > 0) { + rollupLastNotFound(parseResult.pages.length); + } + } } + + const messages = []; messages.push('Detected ' + tocPages.length + ' table of content pages'); - if (foundHeadlines > 0) { - messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines); + if (tocPages.length > 0) { + messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines); + messages.push('TOC 
headline heights: ' + JSON.stringify(headlineTypeToHeightRange)); } if (notFoundHeadlines.length > 0) { - messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber)); + messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber)); + messages.push('Found TOC headlines (by size): ' + foundBySize); } return new ParseResult({ ...parseResult, @@ -148,7 +192,7 @@ export default class DetectTOC extends ToTextItemTransformation { } -function findHeadline(page, tocLink) { +function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) { const headline = tocLink.textItem.text; const headlineFinder = new HeadlineFinder({ headline: headline @@ -158,12 +202,26 @@ function findHeadline(page, tocLink) { const headlineItems = headlineFinder.consume(line); if (headlineItems) { headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION); + const headlineType = headlineByLevel(tocLink.level + 2); + const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0); page.items.splice(lineIndex + 1, 0, new TextItem({ ...headlineItems[0], text: headline, - type: headlineByLevel(tocLink.level + 2), + height: headlineHeight, + type: headlineType, annotation: ADDED_ANNOTATION })); + var range = headlineTypeToHeightRange[headlineType]; + if (range) { + range.min = Math.min(range.min, headlineHeight); + range.max = Math.max(range.max, headlineHeight); + } else { + range = { + min: headlineHeight, + max: headlineHeight + }; + headlineTypeToHeightRange[headlineType] = range; + } return true; } lineIndex++; @@ -171,6 +229,20 @@ function findHeadline(page, tocLink) { return false; } +function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) { + for (var i = fromPage; i <= toPage; i++) { + const page = pages[i - 1]; + for ( var line of page.items ) { + if (!line.type && !line.annotation && line.height 
>= heightRange.min && line.height <= heightRange.max) { + const match = wordMatch(tocLink.textItem.text, line.text); + if (match >= 0.5) { + return line; + } + } + } + } +} + class LinkLeveler { constructor() { diff --git a/test/functions.spec.js b/test/functions.spec.js index b01a85e..35ccbdc 100644 --- a/test/functions.spec.js +++ b/test/functions.spec.js @@ -1,6 +1,6 @@ import { expect } from 'chai'; -import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx' +import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx' describe('hasUpperCaseCharacterInMiddleOfWord', () => { @@ -135,3 +135,20 @@ describe('isNumberedListItem', () => { }); }); + +describe('wordsMatch', () => { + + it('Match', () => { + expect(wordMatch('text 1', 'text 1')).to.equal(1.0); + expect(wordMatch('text 1', 'text 2')).to.equal(0.5); + expect(wordMatch('text 1', 'text 1 2')).to.equal(0.6666666666666666); + expect(wordMatch('text 1 2 3', 'text 1 4 5')).to.equal(0.5); + expect(wordMatch('text 1 2 3', '5 1 4 text')).to.equal(0.5); + expect(wordMatch('text 1 2 3', 'text')).to.equal(0.25); + + expect(wordMatch('text', 'test')).to.equal(0.0); + + expect(wordMatch('inStruCtionS for the full Moon proCeSS', 'Instructions for the Full Moon Process')).to.equal(1.0); + }); + +});