diff --git a/package.json b/package.json index 7d87c49..ba460bb 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "watch": "webpack -d --watch", "build": "webpack", "lint": "eslint src --ext .js --ext .jsx --cache", - "test": "mocha --compilers js:babel-core/register test/*.spec.js", + "test": "mocha --compilers js:babel-core/register test --recursive", "release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p", "deploy": "npm run release && cp -r build/* docs/" }, diff --git a/src/javascript/components/debug/TextItemBlockPageView.jsx b/src/javascript/components/debug/LineItemBlockPageView.jsx similarity index 84% rename from src/javascript/components/debug/TextItemBlockPageView.jsx rename to src/javascript/components/debug/LineItemBlockPageView.jsx index 261c16f..bd51ed7 100644 --- a/src/javascript/components/debug/TextItemBlockPageView.jsx +++ b/src/javascript/components/debug/LineItemBlockPageView.jsx @@ -1,13 +1,12 @@ import React from 'react'; import PageView from './PageView.jsx'; -import TextItemTable from './TextItemTable.jsx'; +import LineItemTable from './LineItemTable.jsx'; -// View for a Page which items are of kind TextItemBlock -export default class TextItemBlockPageView extends PageView { +// View for a Page which items are of kind LineItemBlock +export default class LineItemBlockPageView extends PageView { createItemViews(items, showWhitespaces) { const blockTables = items.map((block, i) => { - var textItems = block.textItems; const blockType = block.type ? ' - ' + block.type.name : null; const blockAnnotation = block.annotation ? { ' - ' + block.annotation.category } : null; @@ -38,7 +37,7 @@ export default class TextItemBlockPageView extends PageView { Block { i + 1 }{ blockType } { blockAnnotation }
- + { footnoteLinks } { footnotes }
diff --git a/src/javascript/components/debug/LineItemPageView.jsx b/src/javascript/components/debug/LineItemPageView.jsx new file mode 100644 index 0000000..a54abae --- /dev/null +++ b/src/javascript/components/debug/LineItemPageView.jsx @@ -0,0 +1,12 @@ +import React from 'react'; +import PageView from './PageView.jsx'; +import LineItemTable from './LineItemTable.jsx'; + +// View for a Page which items are of kind LineItem +export default class LineItemPageView extends PageView { + + createItemViews(items, showWhitespaces) { + return + } + +} \ No newline at end of file diff --git a/src/javascript/components/debug/LineItemTable.jsx b/src/javascript/components/debug/LineItemTable.jsx new file mode 100644 index 0000000..27eb0eb --- /dev/null +++ b/src/javascript/components/debug/LineItemTable.jsx @@ -0,0 +1,108 @@ +import React from 'react'; + +import Table from 'react-bootstrap/lib/Table' + +// Displays an array of LineItem as a table +export default class LineItemTable extends React.Component { + + static propTypes = { + items: React.PropTypes.array.isRequired, + showWhitespaces: React.PropTypes.bool + }; + + render() { + const {showWhitespaces, items} = this.props; + const tableHeader = + + + # + + + Text + + + X + + + Y + + + Width + + + Height + + + + + const itemRows = items.map((item, i) => + +
+ { i } +
+
+ { item.annotation ? item.annotation.category : '' } +
+
+ { item.type ? item.type.name : '' } +
+
+ { item.parsedElements && item.parsedElements.footnoteLinks.length > 0 ?
+ Footnote-Link +
: '' } + { item.parsedElements && item.parsedElements.containLinks ?
+ Link +
: '' } + { item.lineFormat ?
+ { item.lineFormat.name } +
: '' } + { item.unopenedFormat ?
+ Unopened + { ' ' + item.unopenedFormat.name } +
: '' } + { item.parsedElements && item.parsedElements.inlineFormats > 0 ?
+ { item.parsedElements.inlineFormats + 'x Bold/Italic' } +
: '' } + { item.unclosedFormat ?
+ Unclosed + { ' ' + item.unclosedFormat.name } +
: '' } +
+ + + { showWhitespaces ? ( +
{ item.text() }
+ ) : (item.text()) } + + + { item.x } + + + { item.y } + + + { item.width } + + + { item.height } + + + ) + + return ( + + { tableHeader } + + { itemRows } + +
+ ); + } +} \ No newline at end of file diff --git a/src/javascript/functions.jsx b/src/javascript/functions.jsx index 53b6a09..1680a69 100644 --- a/src/javascript/functions.jsx +++ b/src/javascript/functions.jsx @@ -18,6 +18,17 @@ export function isNumber(string) { return true; } +export function hasOnly(string, char) { + const charCode = char.charCodeAt(0); + for (var i = 0; i < string.length; i++) { + const aCharCode = string.charCodeAt(i); + if (aCharCode != charCode) { + return false; + } + } + return true; +} + export function hasUpperCaseCharacterInMiddleOfWord(text) { var beginningOfWord = true; for (var i = 0; i < text.length; i++) { diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 7ecdb4d..14e7be4 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -1,6 +1,7 @@ import { Enum } from 'enumify'; import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx'; + import CompactLines from './transformations/textitem/CompactLines.jsx'; import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx' import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; @@ -56,10 +57,10 @@ export default class AppState { new CompactLines(), new RemoveRepetitiveElements(), new VerticalToHorizontal(), - new PostprocessLines(), + // new PostprocessLines(), new DetectTOC(), new DetectHeaders(), - new CompleteFormats(), + // new CompleteFormats(), new DetectListItems(), new GatherBlocks(), diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx index 3817708..6463a04 100644 --- a/src/javascript/models/ElementType.jsx +++ b/src/javascript/models/ElementType.jsx @@ -1,83 +1,85 @@ import { Enum } from 'enumify'; -import TextItem from './TextItem.jsx'; -import TextItemBlock from './TextItemBlock.jsx'; +import LineItem from './LineItem.jsx'; +import LineItemBlock from './LineItemBlock.jsx'; // An Markdown element export default class ElementType extends Enum { } +//TODO rename to BlockType + ElementType.initEnum({ H1: { headline: true, headlineLevel: 1, - toText(block:TextItemBlock) { - return '# ' + concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return '# ' + concatLineItems(block.items); } }, H2: { headline: true, headlineLevel: 2, - toText(block:TextItemBlock) { - return '## ' + concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return '## ' + concatLineItems(block.items); } }, H3: { headline: true, headlineLevel: 3, - toText(block:TextItemBlock) { - return '### ' + concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return '### ' + concatLineItems(block.items); } }, H4: { headline: true, headlineLevel: 4, - toText(block:TextItemBlock) { - return '#### ' + concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return '#### ' + concatLineItems(block.items); } }, H5: { headline: true, headlineLevel: 5, - toText(block:TextItemBlock) { - return '##### ' + concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return '##### ' + concatLineItems(block.items); } }, H6: { headline: true, headlineLevel: 6, - toText(block:TextItemBlock) { - return '###### ' + concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return '###### ' + concatLineItems(block.items); } }, TOC: { mergeToBlock: true, - toText(block:TextItemBlock) { - return concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return concatLineItems(block.items); } }, FOOTNOTES: { mergeToBlock: true, mergeFollowingNonTypedItems: true, - toText(block:TextItemBlock) { - return concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return concatLineItems(block.items); } }, CODE: { mergeToBlock: true, - toText(block:TextItemBlock) { - return '```\n' + concatTextItems(block.textItems) + '```' + toText(block:LineItemBlock) { + return '```\n' + concatLineItems(block.items) + '```' } }, LIST: { mergeToBlock: true, mergeFollowingNonTypedItemsWithSmallDistance: true, - toText(block:TextItemBlock) { - return concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return concatLineItems(block.items); } }, PARAGRAPH: { - toText(block:TextItemBlock) { - return concatTextItems(block.textItems); + toText(block:LineItemBlock) { + return concatLineItems(block.items); } } }); @@ -86,17 +88,17 @@ export function isHeadline(elementType: ElementType) { return elementType && elementType.name.length == 2 && elementType.name[0] === 'H' } -export function blockToText(block: TextItemBlock) { +export function blockToText(block: LineItemBlock) { if (!block.type) { - return concatTextItems(block.textItems); + return concatLineItems(block.items); } return block.type.toText(block); } -function concatTextItems(textItems: TextItem[]) { +function concatLineItems(lineItems: LineItem[]) { var text = ''; - textItems.forEach(item => { - text += item.text + '\n'; + lineItems.forEach(item => { + text += item.text() + '\n'; }); return text; } diff --git a/src/javascript/models/HeadlineFinder.jsx b/src/javascript/models/HeadlineFinder.jsx index 4e7fe62..4061400 100644 --- a/src/javascript/models/HeadlineFinder.jsx +++ b/src/javascript/models/HeadlineFinder.jsx @@ -4,24 +4,25 @@ export default class HeadlineFinder { constructor(options) { this.headlineCharCodes = normalizedCharCodeArray(options.headline); - this.stackedTextItems = []; + this.stackedLineItems = []; this.stackedChars = 0; } - consume(textItem) { - const normalizedCharCodes = normalizedCharCodeArray(textItem.text); + consume(lineItem) { + //TODO avoid join + const normalizedCharCodes = normalizedCharCodeArray(lineItem.text()); const matchAll = this.matchAll(normalizedCharCodes); if (matchAll) { - this.stackedTextItems.push(textItem); + this.stackedLineItems.push(lineItem); this.stackedChars += normalizedCharCodes.length; if (this.stackedChars == this.headlineCharCodes.length) { - return this.stackedTextItems; + return this.stackedLineItems; } } else { if (this.stackedChars > 0) { this.stackedChars = 0; - this.stackedTextItems = []; - this.consume(textItem); // test again without stack + this.stackedLineItems = []; + this.consume(lineItem); // test again without stack } } return null; diff --git a/src/javascript/models/LineConverter.jsx b/src/javascript/models/LineConverter.jsx new file mode 100644 index 0000000..5ea47ee --- /dev/null +++ b/src/javascript/models/LineConverter.jsx @@ -0,0 +1,145 @@ +import TextItem from './TextItem.jsx'; +import Word from './Word.jsx'; +import WordType from './markdown/WordType.jsx'; +import LineItem from './LineItem.jsx'; +import StashingStream from './StashingStream.jsx'; +import { ParsedElements } from './PageItem.jsx'; +import { isNumber } from '../functions.jsx' +import { sortByX } from '../pageItemFunctions.jsx' + +// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like +//'whitespace removal', bold/emphasis annotation, link-detection, etc.. +export default class LineConverter { + + constructor(fontToFormats) { + this.fontToFormats = fontToFormats; + } + + // returns a CombineResult + compact(textItems: TextItem[]) { + // we can't trust order of occurence, esp. footnoteLinks like to come last + sortByX(textItems); + + const wordStream = new WordDetectionStream(this.fontToFormats); + wordStream.consumeAll(textItems.map(item => new TextItem({ + ...item + }))); + const words = wordStream.complete(); + + var maxHeight = 0; + var widthSum = 0; + textItems.forEach(item => { + maxHeight = Math.max(maxHeight, item.height); + widthSum += item.width; + }); + return new LineItem({ + x: textItems[0].x, + y: textItems[0].y, + height: maxHeight, + width: widthSum, + words: words, + parsedElements: new ParsedElements({ + footnoteLinks: wordStream.footnoteLinks, + footnotes: wordStream.footnotes + }) + }); + + } + +} + +function itemsToWords(items, format) { + const combinedText = combineText(items); + // const combinedText = items.map(textItem => textItem.text).join(''); + const words = combinedText.split(' '); + return words.filter(w => w.trim().length > 0).map(word => { + return new Word({ + string: word, + type: format + }); + }); +} + +function combineText(textItems) { + var text = ''; + var lastItem; + textItems.forEach(textItem => { + if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) { + const xDistance = textItem.x - lastItem.x - lastItem.width; + if (xDistance > 5) { + text += ' '; + } + } + text += textItem.text; + lastItem = textItem; + }); + return text; +} + +class WordDetectionStream extends StashingStream { + + constructor(fontToFormats) { + super(); + this.fontToFormats = fontToFormats; + this.footnoteLinks = []; + this.footnotes = []; + + this.firstY; + this.stashedNumber = false; + this.currentItem; + } + + shouldStash(item) { // eslint-disable-line no-unused-vars + if (!this.firstY) { + this.firstY = item.y; + } + this.currentItem = item; + return true; + } + + onPushOnStash(item) { // eslint-disable-line no-unused-vars + this.stashedNumber = isNumber(item.text.trim()); + } + + doMatchesStash(lastItem, item) { + const lastItemFormat = this.fontToFormats.get(lastItem.font); + const itemFormat = this.fontToFormats.get(item.font); + if (lastItemFormat !== itemFormat) { + return false; + } + const itemIsANumber = isNumber(item.text.trim()); + return this.stashedNumber == itemIsANumber; + } + + doFlushStash(stash, results) { + if (this.stashedNumber) { + const joinedNumber = stash.map(item => item.text).join(''); + if (stash[0].y > this.firstY) { // footnote link + results.push(new Word({ + string: `${joinedNumber}`, + type: WordType.FOOTNOTE_LINK + //TODO format to + //^ + //`[${joinedNumber}](#${joinedNumber})` + })); + this.footnoteLinks.push(parseInt(joinedNumber)); + } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote + results.push(new Word({ + string: `${joinedNumber}`, + type: WordType.FOOTNOTE + //TODO format to (^${ joinedNumber}): + })); + this.footnotes.push(joinedNumber); + } else { + this.copyStashItemsAsText(stash, results); + } + } else { + this.copyStashItemsAsText(stash, results); + } + } + + copyStashItemsAsText(stash, results) { + const format = this.fontToFormats.get(stash[0].font); + results.push(...itemsToWords(stash, format)); + } +} diff --git a/src/javascript/models/LineItem.jsx b/src/javascript/models/LineItem.jsx new file mode 100644 index 0000000..a52afcf --- /dev/null +++ b/src/javascript/models/LineItem.jsx @@ -0,0 +1,29 @@ +import PageItem from './PageItem.jsx' +import Word from './Word.jsx' + +//A line within a page +export default class LineItem extends PageItem { + + constructor(options) { + super(options); + this.x = options.x; + this.y = options.y; + this.width = options.width; + this.height = options.height; + this.words = options.words || []; + if (options.text && !options.words) { + this.words = options.text.split(" ").filter(string => string.trim().length > 0).map(wordAsString => new Word({ + string: wordAsString + })); + } + } + + text() { + return this.wordStrings().join(" "); + } + + wordStrings() { + return this.words.map(word => word.string); + } + +} diff --git a/src/javascript/models/LineItemBlock.jsx b/src/javascript/models/LineItemBlock.jsx new file mode 100644 index 0000000..64f9394 --- /dev/null +++ b/src/javascript/models/LineItemBlock.jsx @@ -0,0 +1,36 @@ +import PageItem from './PageItem.jsx' +import LineItem from './LineItem.jsx' + +// A block of LineItem[] within a Page +export default class LineItemBlock extends PageItem { + + constructor(options) { + super(options); + this.items = []; + if (options.items) { + options.items.forEach(item => this.addItem(item)); + } + } + + addItem(item:LineItem) { + if (this.type && item.type && this.type !== item.type) { + throw `Adding item of type ${item.type} to block of type ${this.type}` + } + if (!this.type) { + this.type = item.type; + } + if (item.parsedElements) { + if (this.parsedElements) { + this.parsedElements.add(item.parsedElements); + } else { + this.parsedElements = item.parsedElements; + } + } + const copiedItem = new LineItem({ + ...item + }); + copiedItem.type = null; + this.items.push(copiedItem); + } + +} diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx index dbb00af..72f465d 100644 --- a/src/javascript/models/PageItem.jsx +++ b/src/javascript/models/PageItem.jsx @@ -1,4 +1,4 @@ -// A abstract PageItem class, can be TextItem, or TextItemBlock +// A abstract PageItem class, can be TextItem, LineItem or LineItemBlock export default class PageItem { constructor(options) { diff --git a/src/javascript/models/StashingStream.jsx b/src/javascript/models/StashingStream.jsx new file mode 100644 index 0000000..630fb8d --- /dev/null +++ b/src/javascript/models/StashingStream.jsx @@ -0,0 +1,73 @@ +//Abstract stream which allows stash items temporarily +export default class StashingStream { + + constructor() { + if (this.constructor === StashingStream) { + throw new TypeError("Can not construct abstract class."); + } + this.results = []; + this.stash = []; + } + + consumeAll(items) { + items.forEach(item => this.consume(item)); + } + + consume(item) { + if (this.shouldStash(item)) { + if (!this.matchesStash(item)) { + this.flushStash(); + } + this.pushOnStash(item); + } else { + if (this.stash.length > 0) { + this.flushStash(); + } + this.results.push(item); + } + } + + pushOnStash(item) { + this.onPushOnStash(item); + this.stash.push(item); + } + + complete() { + if (this.stash.length > 0) { + this.flushStash(); + } + return this.results; + } + + // return true if the item matches the items of the stack + matchesStash(item) { + if (this.stash.length == 0) { + return true; + } + const lastItem = this.stash[this.stash.length - 1]; + return this.doMatchesStash(lastItem, item); + } + + flushStash() { + if (this.stash.length > 0) { + this.doFlushStash(this.stash, this.results); + this.stash = []; + } + } + + onPushOnStash(item) { // eslint-disable-line no-unused-vars + //sub-classes may override + } + + shouldStash(item) { + throw new TypeError("Do not call abstract method foo from child." + item); + } + + doMatchesStash(lastItem, item) { + throw new TypeError("Do not call abstract method foo from child." + lastItem + item); + } + + doFlushStash(stash, results) { + throw new TypeError("Do not call abstract method foo from child." + stash + results); + } +} \ No newline at end of file diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx index d013cc1..1565568 100644 --- a/src/javascript/models/TextItem.jsx +++ b/src/javascript/models/TextItem.jsx @@ -11,8 +11,6 @@ export default class TextItem extends PageItem { this.height = options.height; this.text = options.text; this.font = options.font; - this.fontAscent = options.fontAscent; - this.fontDescent = options.fontDescent; this.lineFormat = options.lineFormat; this.unopenedFormat = options.unopenedFormat; diff --git a/src/javascript/models/TextItemBlock.jsx b/src/javascript/models/TextItemBlock.jsx deleted file mode 100644 index 9e19266..0000000 --- a/src/javascript/models/TextItemBlock.jsx +++ /dev/null @@ -1,36 +0,0 @@ -import PageItem from './PageItem.jsx' -import TextItem from './TextItem.jsx' - -// A block of TextItem[] within a Page -export default class TextItemBlock extends PageItem { - - constructor(options) { - super(options); - this.textItems = []; - if (options.textItems) { - options.textItems.forEach(item => this.addTextItem(item)); - } - } - - addTextItem(textItem:TextItem) { - if (this.type && textItem.type && this.type !== textItem.type) { - throw `Adding text item of type ${textItem.type} to block of type ${this.type}` - } - if (!this.type) { - this.type = textItem.type; - } - if (textItem.parsedElements) { - if (this.parsedElements) { - this.parsedElements.add(textItem.parsedElements); - } else { - this.parsedElements = textItem.parsedElements; - } - } - const copiedTextItem = new TextItem({ - ...textItem - }); - copiedTextItem.type = null; - this.textItems.push(copiedTextItem); - } - -} diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx deleted file mode 100644 index ace1083..0000000 --- a/src/javascript/models/TextItemLineCompactor.jsx +++ /dev/null @@ -1,227 +0,0 @@ -import TextItem from './TextItem.jsx'; -import { ParsedElements } from './PageItem.jsx'; -import { isNumber } from '../functions.jsx' -import { sortByX } from '../textItemFunctions.jsx' -import { prefixAfterWhitespace, suffixBeforeWhitespace } from '../functions.jsx'; - -// Compact text items which have been grouped to a line (through TextItemLineCompactor) to a single TextItem doing inline transformations like -//'whitespace removal', bold/emphasis annotation, link-detection, etc.. -export default class TextItemLineCompactor { - - constructor(fontToFormats) { - this.fontToFormats = fontToFormats; - } - - // returns a CombineResult - compact(lineItems: TextItem[]) { - if (lineItems.length < 2) { - throw "Must be at least 2 line items, but was " + lineItems; - } - - // we can't trust order of occurence, esp. footnoteLinks like to come last - sortByX(lineItems); - - const formatter = new Formatter(this.fontToFormats); - var [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems); - resolvedLineItems.forEach(item => formatter.consume(item)); - resolvedLineItems = formatter.getResults(); - parsedElements.inlineFormats = formatter.inlineFormats; - // const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements); - - var combinedItem; - if (resolvedLineItems.length == 1) { - combinedItem = resolvedLineItems[0]; - } else { - var text = ''; - var maxHeight = 0; - var widthSum = 0; - var lastItem; - resolvedLineItems.forEach(item => { - if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) { - const xDistance = item.x - lastItem.x - lastItem.width; - if (xDistance >= 5) { - text += ' '; - } - } - text += item.text; - widthSum += item.width; - lastItem = item; - maxHeight = Math.max(maxHeight, item.height); - }); - combinedItem = new TextItem({ - ...resolvedLineItems[0], - text: text, - height: maxHeight, - width: widthSum - }); - } - combinedItem.parsedElements = parsedElements; - combinedItem.lineFormat = formatter.lineFormat; - combinedItem.unopenedFormat = formatter.unopenedFormat; - combinedItem.unclosedFormat = formatter.unclosedFormat; - return combinedItem; - } - - - resolveSpecialElements(lineItems) { - const footnoteLinks = []; - const footnotes = []; - const basicY = lineItems[0].y; - const newLineItems = []; - var stashedNumberItems = []; - - const commitStashedNumbers = (nextItem) => { - if (stashedNumberItems.length > 0) { - const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join(''); - if (stashedNumberItems[0].y > basicY) { // footnote link - newLineItems.push(new TextItem({ - ...stashedNumberItems[0], - //TODO make fomatting configurable - // text: `[${joinedNumber}](#${joinedNumber})` - text: `^${joinedNumber}` - })); - footnoteLinks.push(parseInt(joinedNumber)); - } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote - //TODO womb comp [29] => ydiff == 0 - newLineItems.push(new TextItem({ - ...stashedNumberItems[0], - text: `(^${ joinedNumber}): ` - })); - footnotes.push(joinedNumber); - } else { - stashedNumberItems.forEach(number => newLineItems.push(number)); - } - - stashedNumberItems = []; - } - }; - - lineItems.forEach(item => { - if (newLineItems.length == 0 && item.text.trim().length == 0) { - // skip whitespace on the beginning of a line - } else { - const isANumber = isNumber(item.text.trim()); - if (isANumber) { - stashedNumberItems.push(item); - } else { - if (stashedNumberItems.length > 0) { - commitStashedNumbers(item); - } - newLineItems.push(item); - } - } - }); - commitStashedNumbers(); - - - return [newLineItems, new ParsedElements({ - footnoteLinks: footnoteLinks, - footnotes: footnotes - })]; - } - -} - -class Formatter { - - constructor(fontToFormats) { - this.fontToFormats = fontToFormats; - - this.resultItems = []; - this.lineFormat; - this.unopenedFormat; - this.unclosedFormat; - - this.openFormat; - this.stashedItems = []; - this.inlineFormats = 0; - this.lastItem; - } - - - consume(item) { - const formatType = this.fontToFormats.get(item.font); - if (this.openFormat && formatType !== this.openFormat) { - this.flushStash(false); - } - if (formatType.needFormat) { - this.openFormat = formatType; - this.stashedItems.push(item); - } else { - this.resultItems.push(item); - } - } - - getResults() { - if (this.openFormat) { - this.flushStash(true); - } - return this.resultItems; - } - - flushStash(formatToEndOfLine) { - const formatFromBeginningOfLine = this.resultItems == 0; - if (formatFromBeginningOfLine) { - if (formatToEndOfLine) { - this.lineFormat = this.openFormat; - this.moveStashItemsToResult(); - } else { - this.unopenedFormat = this.openFormat; - const newLastItem = this.newClosingItem(this.stashedItems.pop()); - this.moveStashItemsToResult(); - this.resultItems.push(newLastItem); - } - } else { - if (formatToEndOfLine) { - this.unclosedFormat = this.openFormat; - const newFirstItem = this.newOpeningItem(this.stashedItems.shift()); - this.resultItems.push(newFirstItem); - this.moveStashItemsToResult(); - } else { - this.inlineFormats++; - if (this.stashedItems.length == 1) { - const onlyItem = this.stashedItems.pop(); - if (onlyItem.text.trim().length > 0) { - const onlyItemFormatted = this.newCompleteItem(onlyItem); - this.resultItems.push(onlyItemFormatted); - } - this.moveStashItemsToResult(); - } else { - const firstItem = this.newOpeningItem(this.stashedItems.shift()); - const lastItem = this.newClosingItem(this.stashedItems.pop()); - this.resultItems.push(firstItem); - this.moveStashItemsToResult(); - this.resultItems.push(lastItem); - } - } - } - } - - moveStashItemsToResult() { - this.resultItems.push(...this.stashedItems); - this.stashedItems = []; - this.openFormat = null; - } - - newOpeningItem(item) { - return new TextItem({ - ...item, - text: prefixAfterWhitespace(this.openFormat.startSymbol, item.text) - }); - } - - newClosingItem(item) { - return new TextItem({ - ...item, - text: suffixBeforeWhitespace(item.text, this.openFormat.endSymbol) - }); - } - - newCompleteItem(item) { - return new TextItem({ - ...item, - text: suffixBeforeWhitespace(prefixAfterWhitespace(this.openFormat.startSymbol, item.text), this.openFormat.endSymbol) - }); - } - -} diff --git a/src/javascript/models/TextItemLineGrouper.jsx b/src/javascript/models/TextItemLineGrouper.jsx index 54264b5..379191f 100644 --- a/src/javascript/models/TextItemLineGrouper.jsx +++ b/src/javascript/models/TextItemLineGrouper.jsx @@ -1,5 +1,5 @@ import TextItem from './TextItem.jsx'; -import { sortByX } from '../textItemFunctions.jsx' +import { sortByX } from '../pageItemFunctions.jsx' //Groups all text items which are on the same y line export default class TextItemLineGrouper { diff --git a/src/javascript/models/Word.jsx b/src/javascript/models/Word.jsx new file mode 100644 index 0000000..b559c6c --- /dev/null +++ b/src/javascript/models/Word.jsx @@ -0,0 +1,8 @@ +export default class Word { + + constructor(options) { + this.string = options.string; + this.type = options.type; // WordType + } + +} \ No newline at end of file diff --git a/src/javascript/models/markdown/WordType.jsx b/src/javascript/models/markdown/WordType.jsx new file mode 100644 index 0000000..64a531d --- /dev/null +++ b/src/javascript/models/markdown/WordType.jsx @@ -0,0 +1,7 @@ +import { Enum } from 'enumify'; + +// An Markdown word element +export default class WordType extends Enum { +} + +WordType.initEnum(['LINK', 'FOOTNOTE_LINK', 'FOOTNOTE', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']); \ No newline at end of file diff --git a/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx b/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx similarity index 73% rename from src/javascript/models/transformations/ToTextItemBlockTransformation.jsx rename to src/javascript/models/transformations/ToLineItemBlockTransformation.jsx index 43be1ec..92e2524 100644 --- a/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx +++ b/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx @@ -1,16 +1,16 @@ import React from 'react'; import Transformation from './Transformation.jsx'; import ParseResult from '../ParseResult.jsx'; -import TextItemBlock from '../TextItemBlock.jsx'; -import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx'; +import LineItemBlock from '../LineItemBlock.jsx'; +import LineItemBlockPageView from '../../components/debug/LineItemBlockPageView.jsx'; import { REMOVED_ANNOTATION } from '../Annotation.jsx'; -// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView -export default class ToTextItemBlockTransformation extends Transformation { +// Abstract class for transformations producing LineItemBlock(s) to be shown in the LineItemBlockPageView +export default class ToLineItemBlockTransformation extends Transformation { constructor(name) { - super(name, TextItemBlock.name); - if (this.constructor === ToTextItemBlockTransformation) { + super(name, LineItemBlock.name); + if (this.constructor === ToLineItemBlockTransformation) { throw new TypeError("Can not construct abstract class."); } this.showWhitespaces = false; @@ -25,7 +25,7 @@ export default class ToTextItemBlockTransformation extends Transformation { } createPageView(page, modificationsOnly) { - return ; + } + + completeTransform(parseResult:ParseResult) { + // The usual cleanup + parseResult.messages = []; + parseResult.pages.forEach(page => { + page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION); + page.items.forEach(item => item.annotation = null); + }); + return parseResult; + } + + +} \ No newline at end of file diff --git a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx index e28d19f..7a06d00 100644 --- a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx +++ b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx @@ -1,6 +1,7 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; -import StringFormat from '../../StringFormat.jsx'; +import WordType from '../../markdown/WordType.jsx'; +// import StringFormat from '../../StringFormat.jsx'; export default class CalculateGlobalStats extends ToTextItemTransformation { @@ -54,21 +55,21 @@ export default class CalculateGlobalStats extends ToTextItemTransformation { this.fontMap.forEach(function(value, key) { fontIdToName.push(key + " = " + value.name) const fontName = value.name.toLowerCase(); - var format; + var type; if (key == mostUsedFont) { - format = StringFormat.STANDARD; + type = null; } else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) { - format = StringFormat.BOLD_OBLIQUE; + type = WordType.BOLD_OBLIQUE; } else if (fontName.includes('bold')) { - format = StringFormat.BOLD; + type = WordType.BOLD; } else if (fontName.includes('oblique') || fontName.includes('italic')) { - format = StringFormat.OBLIQUE; + type = WordType.OBLIQUE; } else if (fontName === maxHeightFont) { - format = StringFormat.BOLD; - } else { - format = StringFormat.STANDARD; + type = WordType.BOLD; + } + if (type) { + fontToFormats.set(key, type); } - fontToFormats.set(key, format); }); fontIdToName.sort(); diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx index 4b5b6f8..12446bb 100644 --- a/src/javascript/models/transformations/textitem/CompactLines.jsx +++ b/src/javascript/models/transformations/textitem/CompactLines.jsx @@ -1,16 +1,16 @@ import React from 'react'; -import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; -import { ParsedElements } from '../../PageItem.jsx'; +import LineItem from '../../LineItem.jsx'; import TextItemLineGrouper from '../../TextItemLineGrouper.jsx'; -import TextItemLineCompactor from '../../TextItemLineCompactor.jsx'; +import LineConverter from '../../LineConverter.jsx'; import ElementType from '../../ElementType.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; -// gathers text items on the same y line to one text item -export default class CompactLines extends ToTextItemTransformation { +// gathers text items on the same y line to one line item +export default class CompactLines extends ToLineItemTransformation { constructor() { super("Compact To Lines"); @@ -20,58 +20,44 @@ export default class CompactLines extends ToTextItemTransformation { const {mostUsedDistance, fontToFormats} = parseResult.globals; const foundFootnotes = []; const foundFootnoteLinks = []; - var inlineFormats = 0; - var lineFormats = 0; - var unopenedFormats = 0; - var unclosedFormats = 0; + var formattedWords = 0; const lineGrouper = new TextItemLineGrouper({ mostUsedDistance: mostUsedDistance, }); - const lineCompactor = new TextItemLineCompactor(fontToFormats); + const lineCompactor = new LineConverter(fontToFormats); parseResult.pages.forEach(page => { if (page.items.length > 0) { - const newItems = []; + const lineItems = []; const textItemsGroupedByLine = lineGrouper.group(page.items); - textItemsGroupedByLine.forEach(textItemsOfLine => { - var lineItem; - if (textItemsOfLine.length == 1) { - lineItem = textItemsOfLine[0]; - const formatType = fontToFormats.get(lineItem.font); - if (formatType.needFormat) { - lineItem.lineFormat = formatType; - lineItem.parsedElements = new ParsedElements({ - completeLineFormats: 1 - }); - } - } else { - textItemsOfLine.forEach(item => { - item.annotation = REMOVED_ANNOTATION; - newItems.push(item); - }); - - lineItem = lineCompactor.compact(textItemsOfLine); + textItemsGroupedByLine.forEach(lineTextItems => { + const lineItem = lineCompactor.compact(lineTextItems); + if (lineTextItems.length > 1) { lineItem.annotation = ADDED_ANNOTATION; - - if (lineItem.parsedElements.footnoteLinks.length > 0) { - const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },); - foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); - } - if (lineItem.parsedElements.footnotes.length > 0) { - lineItem.type = ElementType.FOOTNOTES; - const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },); - foundFootnotes.push.apply(foundFootnotes, footnotes); - } - inlineFormats += lineItem.parsedElements.inlineFormats; + lineTextItems.forEach(item => { + item.annotation = REMOVED_ANNOTATION; + lineItems.push(new LineItem({ + ...item + })); + }); + } + if (lineItem.words.length == 0) { + lineItem.annotation = REMOVED_ANNOTATION; + } + lineItems.push(lineItem); + + if (lineItem.parsedElements.footnoteLinks.length > 0) { + const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },); + foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); + } + if (lineItem.parsedElements.footnotes.length > 0) { + lineItem.type = ElementType.FOOTNOTES; + const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },); + foundFootnotes.push.apply(foundFootnotes, footnotes); } - if (lineItem.lineFormat) lineFormats++; - if (lineItem.unopenedFormat) unopenedFormats++; - if (lineItem.unclosedFormat) unclosedFormats++; - lineItem.text = lineItem.text.trim(); - newItems.push(lineItem); }); - page.items = newItems; + page.items = lineItems; } }); @@ -79,11 +65,8 @@ export default class CompactLines extends ToTextItemTransformation { return new ParseResult({ ...parseResult, messages: [ - 'Detected ' + lineFormats + ' line formats', - 'Detected ' + inlineFormats + ' inline formats', - 'Detected ' + unclosedFormats + ' opened un-closed formats', - 'Detected ' + unopenedFormats + ' un-opened closed formats', - Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }], + 'Detected ' + formattedWords + ' formatted words', + Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }], Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }], ] }); diff --git a/src/javascript/models/transformations/textitem/CompleteFormats.jsx b/src/javascript/models/transformations/textitem/CompleteFormats.jsx index c073f34..e6ac7f8 100644 --- a/src/javascript/models/transformations/textitem/CompleteFormats.jsx +++ b/src/javascript/models/transformations/textitem/CompleteFormats.jsx @@ -6,6 +6,8 @@ import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../. //Complete unopened/unclosed bold/italic formats export default class CompleteFormats extends ToTextItemTransformation { + //TODO move to block and ignore quotes + constructor() { super("Complete Bold/Italics"); } @@ -81,7 +83,6 @@ class ItemStack { } consume(item) { - const te = item.text; var newItem; const handleFreshUnopened = () => { diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx index 6d3d20d..93d36ad 100644 --- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx +++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx @@ -1,4 +1,4 @@ -import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; @@ -6,7 +6,7 @@ import { headlineByLevel } from '../../ElementType.jsx'; import { isListItem } from '../../../functions.jsx'; //Detect items starting with -, •, etc... -export default class DetectHeaders extends ToTextItemTransformation { +export default class DetectHeaders extends ToLineItemTransformation { constructor() { super("Detect Headers"); @@ -21,15 +21,15 @@ export default class DetectHeaders extends ToTextItemTransformation { const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight); const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); pagesWithMaxHeight.forEach(titlePage => { - titlePage.items.forEach(textItem => { - const height = textItem.height; - if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) { + titlePage.items.forEach(item => { + const height = item.height; + if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) { if (height == maxHeight) { - textItem.type = ElementType.H1; + item.type = ElementType.H1; } else { - textItem.type = ElementType.H2; + item.type = ElementType.H2; } - textItem.annotation = DETECTED_ANNOTATION; + item.annotation = DETECTED_ANNOTATION; detectedHeaders++; } }); @@ -41,10 +41,10 @@ export default class DetectHeaders extends ToTextItemTransformation { var range = headlineTypeToHeightRange[headlineType]; if (range.max > mostUsedHeight) { //use only very clear headlines, only use max parseResult.pages.forEach(page => { - page.items.forEach(textItem => { - if (!textItem.type && textItem.height == range.max) { - textItem.annotation = DETECTED_ANNOTATION; - textItem.type = ElementType.enumValueOf(headlineType); + page.items.forEach(item => { + if (!item.type && item.height == range.max) { + item.annotation = DETECTED_ANNOTATION; + item.type = ElementType.enumValueOf(headlineType); detectedHeaders++ } }); @@ -56,10 +56,10 @@ export default class DetectHeaders extends ToTextItemTransformation { const heights = []; var lastHeight; parseResult.pages.forEach(page => { - page.items.forEach(textItem => { - if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) { - if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) { - heights.push(textItem.height); + page.items.forEach(item => { + if (!item.type && item.height > mostUsedHeight && !isListItem(item.text())) { + if (!heights.includes(item.height) && (!lastHeight || lastHeight > item.height)) { + heights.push(item.height); } } }); @@ -69,11 +69,11 @@ export default class DetectHeaders extends ToTextItemTransformation { heights.forEach((height, i) => { const headlineType = headlineByLevel(2 + i); parseResult.pages.forEach(page => { - page.items.forEach(textItem => { - if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) { + page.items.forEach(item => { + if (!item.type && item.height == height && !isListItem(item.text())) { detectedHeaders++; - textItem.annotation = DETECTED_ANNOTATION; - textItem.type = headlineType; + item.annotation = DETECTED_ANNOTATION; + item.type = headlineType; } }); }); @@ -83,9 +83,9 @@ export default class DetectHeaders extends ToTextItemTransformation { //find headlines which have paragraph height var smallesHeadlineLevel = 1; parseResult.pages.forEach(page => { - page.items.forEach(textItem => { - if (textItem.type && textItem.type.headline) { - smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel); + page.items.forEach(item => { + if (item.type && item.type.headline) { + smallesHeadlineLevel = Math.max(smallesHeadlineLevel, item.type.headlineLevel); } }); }); @@ -93,18 +93,18 @@ export default class DetectHeaders extends ToTextItemTransformation { const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1); parseResult.pages.forEach(page => { var lastItem; - page.items.forEach(textItem => { - if (!textItem.type - && textItem.height == mostUsedHeight - && textItem.font !== mostUsedFont - && (!lastItem || lastItem.y < textItem.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - textItem.y > mostUsedDistance * 2)) - && textItem.text === textItem.text.toUpperCase() + page.items.forEach(item => { + if (!item.type + && item.height == mostUsedHeight + && item.font !== mostUsedFont + && (!lastItem || lastItem.y < item.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - item.y > mostUsedDistance * 2)) + && item.text() === item.text().toUpperCase() ) { detectedHeaders++; - textItem.annotation = DETECTED_ANNOTATION; - textItem.type = nextHeadlineType; + item.annotation = DETECTED_ANNOTATION; + item.type = nextHeadlineType; } - lastItem = textItem; + lastItem = item; }); }); } @@ -124,8 +124,8 @@ export default class DetectHeaders extends ToTextItemTransformation { function findPagesWithMaxHeight(pages, maxHeight) { const maxHeaderPagesSet = new Set(); pages.forEach(page => { - page.items.forEach(textItem => { - if (!textItem.type && textItem.height == maxHeight) { + page.items.forEach(item => { + if (!item.type && item.height == maxHeight) { maxHeaderPagesSet.add(page); } }); diff --git a/src/javascript/models/transformations/textitem/DetectListItems.jsx b/src/javascript/models/transformations/textitem/DetectListItems.jsx index 0c99ee2..8bdd197 100644 --- a/src/javascript/models/transformations/textitem/DetectListItems.jsx +++ b/src/javascript/models/transformations/textitem/DetectListItems.jsx @@ -1,12 +1,12 @@ -import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; -import TextItem from '../../TextItem.jsx'; +import LineItem from '../../LineItem.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx'; //Detect items starting with -, •, etc... -export default class DetectListItems extends ToTextItemTransformation { +export default class DetectListItems extends ToLineItemTransformation { constructor() { super("Detect List Items"); @@ -16,34 +16,34 @@ export default class DetectListItems extends ToTextItemTransformation { var foundListItems = 0; var foundNumberedItems = 0; parseResult.pages.forEach(page => { - const newTextItems = []; - page.items.forEach(textItem => { - newTextItems.push(textItem); - if (!textItem.type) { - var text = textItem.text; + const newItems = []; + page.items.forEach(item => { + newItems.push(item); + if (!item.type) { + var text = item.text(); if (isListItem(text)) { foundListItems++ const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length); if (textWithDash === text) { - textItem.annotation = DETECTED_ANNOTATION; - textItem.type = ElementType.LIST; + item.annotation = DETECTED_ANNOTATION; + item.type = ElementType.LIST; } else { - textItem.annotation = REMOVED_ANNOTATION; - newTextItems.push(new TextItem({ - ...textItem, + item.annotation = REMOVED_ANNOTATION; + newItems.push(new LineItem({ + ...item, text: textWithDash, annotation: ADDED_ANNOTATION, type: ElementType.LIST })); } - } else if (isNumberedListItem(text)) { + } else if (isNumberedListItem(text)) { //TODO check that starts with 1 (kala chakra) foundNumberedItems++; - textItem.annotation = DETECTED_ANNOTATION; - textItem.type = ElementType.LIST; + item.annotation = DETECTED_ANNOTATION; + item.type = ElementType.LIST; } } }); - page.items = newTextItems; + page.items = newItems; }); return new ParseResult({ diff --git a/src/javascript/models/transformations/textitem/DetectTOC.jsx b/src/javascript/models/transformations/textitem/DetectTOC.jsx index ff9ecf3..93ca758 100644 --- a/src/javascript/models/transformations/textitem/DetectTOC.jsx +++ b/src/javascript/models/transformations/textitem/DetectTOC.jsx @@ -1,14 +1,15 @@ -import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; -import TextItem from '../../TextItem.jsx'; +import LineItem from '../../LineItem.jsx'; +import Word from '../../Word.jsx'; import HeadlineFinder from '../../HeadlineFinder.jsx'; -import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; +import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx'; -import { isDigit, wordMatch } from '../../../functions.jsx' +import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx' -//Detect table of contents pages -export default class DetectTOC extends ToTextItemTransformation { +//Detect table of contents pages plus linked headlines +export default class DetectTOC extends ToLineItemTransformation { constructor() { super("Detect TOC"); @@ -17,64 +18,68 @@ export default class DetectTOC extends ToTextItemTransformation { transform(parseResult:ParseResult) { const tocPages = []; const maxPagesToEvaluate = Math.min(20, parseResult.pages.length); - const linkLeveler = new LinkLeveler(); + + var tocLinks = []; var lastTocPage; var headlineItem; parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => { - const lineItemsWithDigits = []; + var lineItemsWithDigits = 0; const unknownLines = new Set(); const pageTocLinks = []; - var lastLineTextWithoutNumber; + var lastWordsWithoutNumber; var lastLine; + //find lines ending with a number per page page.items.forEach(line => { - var lineText = line.text.replace(/\./g, '').trim(); - var endsWithDigit = false; - var digits = []; - while (isDigit(lineText.charCodeAt(lineText.length - 1))) { - digits.unshift(lineText.charAt(lineText.length - 1)); - lineText = lineText.substring(0, lineText.length - 1); - endsWithDigit = true; + var words = line.words.filter(word => !hasOnly(word.string, '.')); + const digits = []; + while (words.length > 0 && isNumber(words[words.length - 1].string)) { + const lastWord = words.pop(); + digits.unshift(lastWord.string); } - lineText = lineText.trim(); + + if (digits.length == 0 && words.length > 0) { + const lastWord = words[words.length - 1]; + while (isDigit(lastWord.string.charCodeAt(lastWord.string.length - 1))) { + digits.unshift(lastWord.string.charAt(lastWord.string.length - 1)) + lastWord.string = lastWord.string.substring(0, lastWord.string.length - 1); + } + } + var endsWithDigit = digits.length > 0; if (endsWithDigit) { endsWithDigit = true; - if (lastLineTextWithoutNumber) { // 2-line item ? - lineText = lastLineTextWithoutNumber + ' ' + lineText; - lastLineTextWithoutNumber = null; + if (lastWordsWithoutNumber) { // 2-line item ? + words.push(...lastWordsWithoutNumber); + lastWordsWithoutNumber = null; } pageTocLinks.push(new TocLink({ pageNumber: parseInt(digits.join('')), - textItem: new TextItem({ + lineItem: new LineItem({ ...line, - text: lineText + words: words }) })); - lineItemsWithDigits.push(new TextItem({ - ...line, - text: lineText - })); - lastLineTextWithoutNumber = null; + lineItemsWithDigits++; } else { if (!headlineItem) { headlineItem = line; } else { - if (lastLineTextWithoutNumber) { + if (lastWordsWithoutNumber) { unknownLines.add(lastLine); } - lastLineTextWithoutNumber = lineText; + lastWordsWithoutNumber = words; lastLine = line; } } }); // page has been processed - if (lineItemsWithDigits.length * 100 / page.items.length > 75) { + if (lineItemsWithDigits * 100 / page.items.length > 75) { tocPages.push(page.index + 1); lastTocPage = page; linkLeveler.levelPageItems(pageTocLinks); - tocLinks = tocLinks.concat(pageTocLinks); + tocLinks.push(...pageTocLinks); const newBlocks = []; page.items.forEach((line) => { @@ -83,7 +88,7 @@ export default class DetectTOC extends ToTextItemTransformation { } newBlocks.push(line); if (line === headlineItem) { - newBlocks.push(new TextItem({ + newBlocks.push(new LineItem({ ...line, type: ElementType.H2, annotation: ADDED_ANNOTATION @@ -105,8 +110,10 @@ export default class DetectTOC extends ToTextItemTransformation { if (tocPages.length > 0) { // Add TOC items tocLinks.forEach(tocLink => { - lastTocPage.items.push(new TextItem({ - text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text, + lastTocPage.items.push(new LineItem({ + words: [new Word({ + string: ' '.repeat(tocLink.level * 3) + '-' + })].concat(tocLink.lineItem.words), type: ElementType.TOC, annotation: ADDED_ANNOTATION })); @@ -118,11 +125,11 @@ export default class DetectTOC extends ToTextItemTransformation { var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping]; var foundHealineItems; if (linkedPage) { - foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text); + foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text()); if (!foundHealineItems) { // pages are off by 1 ? linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1]; if (linkedPage) { - foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text); + foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text()); } } } @@ -142,11 +149,16 @@ export default class DetectTOC extends ToTextItemTransformation { const headlineType = headlineByLevel(notFoundTocLink.level + 2); const heightRange = headlineTypeToHeightRange[headlineType.name]; if (heightRange) { - const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber); - if (textItem) { - textItem.type = headlineType; - textItem.annotation = DETECTED_ANNOTATION; - foundBySize.push(textItem.text); + const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber); + if (lineIndex > -1) { + const page = parseResult.pages[pageIndex]; + page.items[lineIndex].annotation = REMOVED_ANNOTATION; + page.items.splice(lineIndex + 1, 0, new LineItem({ + ...notFoundTocLink.lineItem, + type: headlineType, + annotation: ADDED_ANNOTATION, + })); + foundBySize.push(notFoundTocLink); } } }); @@ -173,12 +185,12 @@ export default class DetectTOC extends ToTextItemTransformation { const messages = []; messages.push('Detected ' + tocPages.length + ' table of content pages'); if (tocPages.length > 0) { - messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines); messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange)); + messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines); } if (notFoundHeadlines.length > 0) { - messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber)); - messages.push('Found TOC headlines (by size): ' + foundBySize); + messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text())); + messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber)); } return new ParseResult({ ...parseResult, @@ -196,7 +208,7 @@ export default class DetectTOC extends ToTextItemTransformation { //Find out how the TOC page link actualy translates to the page.index function detectPageMappingNumber(pages, tocLinks) { for ( var tocLink of tocLinks ) { - const page = findPageWithHeadline(pages, tocLink.textItem.text); + const page = findPageWithHeadline(pages, tocLink.lineItem.text()); if (page) { return page.index - tocLink.pageNumber; } @@ -235,9 +247,9 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange) foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION); const headlineType = headlineByLevel(tocLink.level + 2); const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0); - page.items.splice(foundItems.lineIndex + 1, 0, new TextItem({ + page.items.splice(foundItems.lineIndex + 1, 0, new LineItem({ ...foundItems.headlineItems[0], - text: tocLink.textItem.text, + words: tocLink.lineItem.words, height: headlineHeight, type: headlineType, annotation: ADDED_ANNOTATION @@ -255,21 +267,22 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange) } } -function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) { +function findPageAndLineFromHeadline(pages, tocLink, heightRange, fromPage, toPage) { + const linkText = tocLink.lineItem.text().toUpperCase(); for (var i = fromPage; i <= toPage; i++) { const page = pages[i - 1]; - for ( var line of page.items ) { + const lineIndex = page.items.findIndex(line => { if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) { - const match = wordMatch(tocLink.textItem.text, line.text); - if (match >= 0.5) { - return line; - } + const match = wordMatch(linkText, line.text()); + return match >= 0.5; } - } + return false; + }); + if (lineIndex > -1) return [i - 1, lineIndex]; } + return [-1, -1]; } - class LinkLeveler { constructor() { this.levelByMethod = null; @@ -297,13 +310,13 @@ class LinkLeveler { levelByXDiff(tocLinks) { const uniqueX = this.calculateUniqueX(tocLinks); tocLinks.forEach(link => { - link.level = uniqueX.indexOf(link.textItem.x); + link.level = uniqueX.indexOf(link.lineItem.x); }); } levelByFont(tocLinks) { tocLinks.forEach(link => { - link.level = this.uniqueFonts.indexOf(link.textItem.font); + link.level = this.uniqueFonts.indexOf(link.lineItem.font); }); } @@ -315,7 +328,7 @@ class LinkLeveler { calculateUniqueX(tocLinks) { var uniqueX = tocLinks.reduce(function(uniquesArray, link) { - if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x); + if (uniquesArray.indexOf(link.lineItem.x) < 0) uniquesArray.push(link.lineItem.x); return uniquesArray; }, []); @@ -328,7 +341,7 @@ class LinkLeveler { calculateUniqueFonts(tocLinks) { var uniqueFont = tocLinks.reduce(function(uniquesArray, link) { - if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font); + if (uniquesArray.indexOf(link.lineItem.font) < 0) uniquesArray.push(link.lineItem.font); return uniquesArray; }, []); @@ -339,7 +352,7 @@ class LinkLeveler { class TocLink { constructor(options) { - this.textItem = options.textItem; + this.lineItem = options.lineItem; this.pageNumber = options.pageNumber; this.level = 0; } diff --git a/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx index 3d56883..d84f82e 100644 --- a/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx +++ b/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx @@ -1,4 +1,4 @@ -import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; import { REMOVED_ANNOTATION } from '../../Annotation.jsx'; @@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) { // Remove elements with similar content on same page positions, like page numbers, licenes information, etc... -export default class RemoveRepetitiveElements extends ToTextItemTransformation { +export default class RemoveRepetitiveElements extends ToLineItemTransformation { constructor() { super("Remove Repetitive Elements"); @@ -58,8 +58,8 @@ export default class RemoveRepetitiveElements extends ToTextItemTransformation { maxElements: [] }); - const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), '')); - const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), '')); + const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), '')); + const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), '')); pageStore.push({ minElements: minMaxItems.minElements, maxElements: minMaxItems.maxElements, diff --git a/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx b/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx index 6290d22..649df96 100644 --- a/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx +++ b/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx @@ -1,10 +1,11 @@ -import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; -import TextItem from '../../TextItem.jsx'; +import LineItem from '../../LineItem.jsx'; +import StashingStream from '../../StashingStream.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; // Converts vertical text to horizontal -export default class VerticalToHorizontal extends ToTextItemTransformation { +export default class VerticalToHorizontal extends ToLineItemTransformation { constructor() { super("Vertical to Horizontal Text"); @@ -12,87 +13,64 @@ export default class VerticalToHorizontal extends ToTextItemTransformation { transform(parseResult:ParseResult) { var foundVerticals = 0; - const newPages = parseResult.pages.map(page => { - const newTextItems = []; - // var oneCharacterItems = []; - - // const applyTransformation = () => { - // oneCharacterItems.forEach(item => { - // item.annotation = REMOVED_ANNOTATION; - // newTextItems.push(item); - // //TODO add new - // }); - // oneCharacterItems = []; - // }; - // const rollbackTransformation = () => { - // oneCharacterItems.forEach(item => { - // newTextItems.push(item); - // }); - // oneCharacterItems = []; - // }; - - //TODO generic state machine code ? - - const leftOver = page.items.reduce((oneCharacterItems, item) => { - if (item.text.trim().length == 1) { - if (oneCharacterItems.length == 0) { - oneCharacterItems.push(item); - } else { - const lastItem = oneCharacterItems[oneCharacterItems.length - 1]; - if (lastItem.y - item.y > 5 && lastItem.font === item.font) { - oneCharacterItems.push(item); - } else { - if (oneCharacterItems.length > 5) { - var combinedText = ''; - var minX = 999; - var maxY = 0; - var sumWidth = 0; - var maxHeight = 0; - oneCharacterItems.forEach(oneCharacterItem => { - oneCharacterItem.annotation = REMOVED_ANNOTATION; - newTextItems.push(oneCharacterItem); - combinedText += oneCharacterItem.text.trim(); - minX = Math.min(minX, oneCharacterItem.x); - maxY = Math.max(maxY, oneCharacterItem.y); - sumWidth += oneCharacterItem.width; - maxHeight = Math.max(maxHeight, oneCharacterItem.height); - }); - newTextItems.push(new TextItem({ - ...oneCharacterItems[0], - x: minX, - y: maxY, - width: sumWidth, - height: maxHeight, - text: combinedText, - annotation: ADDED_ANNOTATION - })); - foundVerticals++; - } else { - oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem)); - } - oneCharacterItems = [item]; - } - } - } else { - oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem)); - oneCharacterItems = []; - newTextItems.push(item); - } - return oneCharacterItems; - }, []); - leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem)); - - return { - ...page, - items: newTextItems - }; + parseResult.pages.forEach(page => { + const stream = new VerticalsStream(); + stream.consumeAll(page.items); + page.items = stream.complete(); + foundVerticals += stream.foundVerticals; }); + return new ParseResult({ ...parseResult, - pages: newPages, messages: ["Converted " + foundVerticals + " verticals"] }); } - +} + +class VerticalsStream extends StashingStream { + + constructor() { + super(); + this.foundVerticals = 0; + } + + shouldStash(item) { + return item.words.length == 1 && item.words[0].string.length == 1; + } + + doMatchesStash(lastItem, item) { + return lastItem.y - item.y > 5 && lastItem.words[0].type === item.words[0].type; + } + + doFlushStash(stash, results) { + if (stash.length > 5) { // unite + var combinedWords = []; + var minX = 999; + var maxY = 0; + var sumWidth = 0; + var maxHeight = 0; + stash.forEach(oneCharacterLine => { + oneCharacterLine.annotation = REMOVED_ANNOTATION; + results.push(oneCharacterLine); + combinedWords.push(oneCharacterLine.words[0]); + minX = Math.min(minX, oneCharacterLine.x); + maxY = Math.max(maxY, oneCharacterLine.y); + sumWidth += oneCharacterLine.width; + maxHeight = Math.max(maxHeight, oneCharacterLine.height); + }); + results.push(new LineItem({ + ...stash[0], + x: minX, + y: maxY, + width: sumWidth, + height: maxHeight, + words: combinedWords, + annotation: ADDED_ANNOTATION + })); + this.foundVerticals++; + } else { //add as singles + results.push(...stash); + } + } } diff --git a/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx b/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx index 1f96b88..6b0dd45 100644 --- a/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx +++ b/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx @@ -1,11 +1,11 @@ -import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx'; +import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; -import { minXFromBlocks } from '../../../textItemFunctions.jsx'; +import { minXFromBlocks } from '../../../pageItemFunctions.jsx'; //Detect items which are code/quote blocks -export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation { +export default class DetectCodeQuoteBlocks extends ToLineItemBlockTransformation { constructor() { super("Detect Code/Quote Blocks"); @@ -17,7 +17,7 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation parseResult.pages.forEach(page => { var minX = minXFromBlocks(page.items); page.items.forEach(block => { - if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) { + if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) { block.annotation = DETECTED_ANNOTATION; block.type = ElementType.CODE; foundCodeItems++; @@ -36,14 +36,14 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation } -function looksLikeCodeBlock(minX, textItems, mostUsedHeight) { - if (textItems.length == 0) { +function looksLikeCodeBlock(minX, items, mostUsedHeight) { + if (items.length == 0) { return false; } - if (textItems.length == 1) { - return textItems[0].x > minX && textItems[0].height <= mostUsedHeight + 1; + if (items.length == 1) { + return items[0].x > minX && items[0].height <= mostUsedHeight + 1; } - for ( var item of textItems ) { + for ( var item of items ) { if (item.x == minX) { return false; } diff --git a/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx b/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx index e058c2b..eb766d5 100644 --- a/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx +++ b/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx @@ -1,10 +1,11 @@ -import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx'; +import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; +import Word from '../../Word.jsx'; import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; // Cares for proper sub-item spacing/leveling -export default class DetectListLevels extends ToTextItemBlockTransformation { +export default class DetectListLevels extends ToLineItemBlockTransformation { constructor() { super("Level Lists"); @@ -21,23 +22,25 @@ export default class DetectListLevels extends ToTextItemBlockTransformation { var currentLevel = 0; const xByLevel = {}; var modifiedBlock = false; - listBlock.textItems.forEach(textItem => { + listBlock.items.forEach(item => { const isListItem = true; if (lastItemX && isListItem) { - if (textItem.x > lastItemX) { + if (item.x > lastItemX) { currentLevel++; - xByLevel[textItem.x] = currentLevel; - } else if (textItem.x < lastItemX) { - currentLevel = xByLevel[textItem.x]; + xByLevel[item.x] = currentLevel; + } else if (item.x < lastItemX) { + currentLevel = xByLevel[item.x]; } } else { - xByLevel[textItem.x] = 0; + xByLevel[item.x] = 0; } if (currentLevel > 0) { - textItem.text = ' '.repeat(currentLevel * 3) + textItem.text; + item.words = [new Word({ + string: ' '.repeat(currentLevel * 3) + })].concat(item.words); modifiedBlock = true; } - lastItemX = textItem.x; + lastItemX = item.x; }); listBlocks++; if (modifiedBlock) { diff --git a/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx b/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx index 0589739..6e179f2 100644 --- a/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx +++ b/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx @@ -1,11 +1,11 @@ -import ToTextItemBlockTransformation from '../ToTextItemBlockTransformation.jsx'; +import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; -import TextItemBlock from '../../TextItemBlock.jsx'; +import LineItemBlock from '../../LineItemBlock.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; -import { minXFromTextItems } from '../../../textItemFunctions.jsx'; +import { minXFromPageItems } from '../../../pageItemFunctions.jsx'; // Gathers lines to blocks -export default class GatherBlocks extends ToTextItemBlockTransformation { +export default class GatherBlocks extends ToLineItemBlockTransformation { constructor() { super("Gather Blocks"); @@ -14,29 +14,29 @@ export default class GatherBlocks extends ToTextItemBlockTransformation { transform(parseResult:ParseResult) { const {mostUsedDistance} = parseResult.globals; var createdBlocks = 0; - var textItems = 0; + var lineItemCount = 0; parseResult.pages.map(page => { - textItems += page.items.length; + lineItemCount += page.items.length; const blocks = []; - var stashedBlock = new TextItemBlock({}); + var stashedBlock = new LineItemBlock({}); const flushStashedItems = () => { - if (stashedBlock.textItems.length > 1) { + if (stashedBlock.items.length > 1) { stashedBlock.annotation = DETECTED_ANNOTATION; } blocks.push(stashedBlock); - stashedBlock = new TextItemBlock({}); + stashedBlock = new LineItemBlock({}); createdBlocks++; }; - var minX = minXFromTextItems(page.items); + var minX = minXFromPageItems(page.items); page.items.forEach(item => { - if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) { + if (stashedBlock.items.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) { flushStashedItems(); } - stashedBlock.addTextItem(item); + stashedBlock.addItem(item); }); - if (stashedBlock.textItems.length > 0) { + if (stashedBlock.items.length > 0) { flushStashedItems(); } page.items = blocks; @@ -44,7 +44,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation { return new ParseResult({ ...parseResult, - messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items'] + messages: ['Gathered ' + createdBlocks + ' blocks out of ' + lineItemCount + ' line items'] }); } @@ -54,7 +54,7 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) { if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) { return false; } - const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1]; + const lastItem = stashedBlock.items[stashedBlock.items.length - 1]; const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance); if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) { return false; diff --git a/src/javascript/textItemFunctions.jsx b/src/javascript/pageItemFunctions.jsx similarity index 58% rename from src/javascript/textItemFunctions.jsx rename to src/javascript/pageItemFunctions.jsx index 8c33ffc..562e262 100644 --- a/src/javascript/textItemFunctions.jsx +++ b/src/javascript/pageItemFunctions.jsx @@ -1,10 +1,10 @@ -import TextItemBlock from './models/TextItemBlock.jsx'; -import TextItem from './models/TextItem.jsx'; +import PageItem from './models/PageItem.jsx'; +import LineItemBlock from './models/LineItemBlock.jsx'; -export function minXFromBlocks(blocks:TextItemBlock[]) { +export function minXFromBlocks(blocks:LineItemBlock[]) { var minX = 999; blocks.forEach(block => { - block.textItems.forEach(item => { + block.items.forEach(item => { minX = Math.min(minX, item.x) }); }); @@ -14,7 +14,7 @@ export function minXFromBlocks(blocks:TextItemBlock[]) { return minX; } -export function minXFromTextItems(items:TextItem) { +export function minXFromPageItems(items:PageItem) { var minX = 999; items.forEach(item => { minX = Math.min(minX, item.x) @@ -25,13 +25,13 @@ export function minXFromTextItems(items:TextItem) { return minX; } -export function sortByX(items:TextItem) { +export function sortByX(items:PageItem) { items.sort((a, b) => { return a.x - b.x; }); } -export function sortCopyByX(items:TextItem) { +export function sortCopyByX(items:PageItem) { const copy = items.concat(); sortByX(copy); return copy; diff --git a/test/HeadlineFinder.spec.js b/test/HeadlineFinder.spec.js index 629c4b0..22bf00a 100644 --- a/test/HeadlineFinder.spec.js +++ b/test/HeadlineFinder.spec.js @@ -1,31 +1,30 @@ import { expect } from 'chai'; import HeadlineFinder from '../src/javascript/models/HeadlineFinder'; -import TextItem from '../src/javascript/models/TextItem.jsx'; +import LineItem from '../src/javascript/models/LineItem.jsx'; describe('HeadlineFinder', () => { - it('Not Found - Case 1', () => { const headlineFinder = new HeadlineFinder({ headline: 'My Little Headline' }); - const item1 = new TextItem({ + const item1 = new LineItem({ text: 'My ' }); - const item2 = new TextItem({ + const item2 = new LineItem({ text: 'Little' }); - const item3 = new TextItem({ + const item3 = new LineItem({ text: ' Headline2' }); expect(headlineFinder.consume(item1)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.consume(item2)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.consume(item3)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(0); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(0); }); @@ -33,22 +32,22 @@ describe('HeadlineFinder', () => { const headlineFinder = new HeadlineFinder({ headline: 'My Little Headline' }); - const item1 = new TextItem({ + const item1 = new LineItem({ text: 'My ' }); - const item2 = new TextItem({ + const item2 = new LineItem({ text: 'Little' }); - const item3 = new TextItem({ + const item3 = new LineItem({ text: ' Headline' }); expect(headlineFinder.consume(item1)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.consume(item2)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); }); @@ -56,27 +55,27 @@ describe('HeadlineFinder', () => { const headlineFinder = new HeadlineFinder({ headline: 'My Little Headline' }); - const item0 = new TextItem({ + const item0 = new LineItem({ text: 'Waste ' }); - const item1 = new TextItem({ + const item1 = new LineItem({ text: 'My ' }); - const item2 = new TextItem({ + const item2 = new LineItem({ text: 'Little' }); - const item3 = new TextItem({ + const item3 = new LineItem({ text: ' Headline' }); expect(headlineFinder.consume(item0)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(0); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(0); expect(headlineFinder.consume(item1)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.consume(item2)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); }); @@ -84,27 +83,27 @@ describe('HeadlineFinder', () => { const headlineFinder = new HeadlineFinder({ headline: 'My Little Headline' }); - const item0 = new TextItem({ + const item0 = new LineItem({ text: 'My ' }); - const item1 = new TextItem({ + const item1 = new LineItem({ text: 'My ' }); - const item2 = new TextItem({ + const item2 = new LineItem({ text: 'Little' }); - const item3 = new TextItem({ + const item3 = new LineItem({ text: ' Headline' }); expect(headlineFinder.consume(item0)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item0); expect(headlineFinder.consume(item1)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.consume(item2)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); }); @@ -112,22 +111,22 @@ describe('HeadlineFinder', () => { const headlineFinder = new HeadlineFinder({ headline: 'MYLitt le HEADline' }); - const item1 = new TextItem({ + const item1 = new LineItem({ text: 'My ' }); - const item2 = new TextItem({ + const item2 = new LineItem({ text: 'Little' }); - const item3 = new TextItem({ + const item3 = new LineItem({ text: ' Headline' }); expect(headlineFinder.consume(item1)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.consume(item2)).to.equal(null); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); - expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); + expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); }); diff --git a/test/functions.spec.js b/test/functions.spec.js index 221b04c..8aa553e 100644 --- a/test/functions.spec.js +++ b/test/functions.spec.js @@ -2,9 +2,10 @@ import { expect } from 'chai'; import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx' -describe('hasUpperCaseCharacterInMiddleOfWord', () => { +describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => { it('single word', () => { + expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false); expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false); @@ -38,7 +39,7 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => { }); }); -describe('removeLeadingWhitespaces', () => { +describe('functions: removeLeadingWhitespaces', () => { it('No Removes', () => { expect(removeLeadingWhitespaces(".")).to.be.equal("."); expect(removeLeadingWhitespaces(". ")).to.be.equal(". "); @@ -54,7 +55,7 @@ describe('removeLeadingWhitespaces', () => { }); -describe('removeTrailingWhitespaces', () => { +describe('functions: removeTrailingWhitespaces', () => { it('No Removes', () => { expect(removeTrailingWhitespaces(".")).to.be.equal("."); expect(removeTrailingWhitespaces(" .")).to.be.equal(" ."); @@ -71,7 +72,7 @@ describe('removeTrailingWhitespaces', () => { }); -describe('prefixAfterWhitespace', () => { +describe('functions: prefixAfterWhitespace', () => { it('Basic', () => { expect(prefixAfterWhitespace('1', '2')).to.be.equal('12'); expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12'); @@ -81,7 +82,7 @@ describe('prefixAfterWhitespace', () => { }); }); -describe('suffixBeforeWhitespace', () => { +describe('functions: suffixBeforeWhitespace', () => { it('Basic', () => { expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. '); expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.'); @@ -92,7 +93,7 @@ describe('suffixBeforeWhitespace', () => { }); -describe('charCodeArray', () => { +describe('functions: charCodeArray', () => { it('Charcodes', () => { expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46); }); @@ -105,7 +106,7 @@ describe('charCodeArray', () => { }); -describe('normalizedCharCodeArray', () => { +describe('functions: normalizedCharCodeArray', () => { it('No Change', () => { expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD"); @@ -131,7 +132,7 @@ describe('normalizedCharCodeArray', () => { }); -describe('isListItem', () => { +describe('functions: isListItem', () => { it('Match', () => { expect(isListItem('- my text')).to.equal(true); @@ -154,7 +155,7 @@ describe('isListItem', () => { }); -describe('isNumberedListItem', () => { +describe('functions: isNumberedListItem', () => { it('Match', () => { expect(isNumberedListItem('1. my text')).to.equal(true); @@ -173,7 +174,7 @@ describe('isNumberedListItem', () => { }); -describe('wordsMatch', () => { +describe('functions: wordsMatch', () => { it('Match', () => { expect(wordMatch('text 1', 'text 1')).to.equal(1.0); diff --git a/test/models/StashingStream.spec.js b/test/models/StashingStream.spec.js new file mode 100644 index 0000000..538a4a9 --- /dev/null +++ b/test/models/StashingStream.spec.js @@ -0,0 +1,64 @@ +import { expect } from 'chai'; + +import StashingStream from '../../src/javascript/models/StashingStream'; +import TextItem from '../../src/javascript/models/TextItem.jsx'; + +describe('StashingStream', () => { + + it('Simple', () => { + const stream = new MyStashingStream(); + + stream.consume('a'); + stream.consume('b'); + stream.consume('a'); + stream.consume('a'); + stream.consume('z'); + stream.consume('m'); + stream.consume('m'); + stream.consume('z'); + stream.consume('z'); + stream.consume('c'); + stream.consume('e'); + stream.consume('f'); + stream.consume('m'); + stream.consume('a'); + + const resultsAsString = stream.complete().join(''); + + expect(resultsAsString).to.equal('AbAAZZZcefA'); + expect(stream.transformedItems).to.equal(10); + }); + + it('ConsumeAll', () => { + const items = ['k', 'k', 'x', 'a', 'm', 'z', 'o', 'p'] + const stream = new MyStashingStream(); + stream.consumeAll(items); + + const resultsAsString = stream.complete().join(''); + expect(resultsAsString).to.equal('kkxAZop'); + expect(stream.transformedItems).to.equal(3); + }); + +}); + + +class MyStashingStream extends StashingStream { + + constructor() { + super(); + this.transformedItems = 0; + } + + shouldStash(item) { + return item === 'a' || item === 'z' || item === 'm'; + } + + doMatchesStash(lastItem, item) { + return lastItem === item; + } + + doFlushStash(stash, results) { + this.transformedItems += stash.length; + results.push(...stash.filter(elem => elem !== 'm').map(item => item.toUpperCase())); + } +} \ No newline at end of file