WIP Introduce word/wordType/lineItem

* Add a way to do the markdown transformation of inline formats (bold, italic, link, footnote, etc.) at the end and not in the middle
* Introduce StashingStream as a helper
This commit is contained in:
Johannes Zillmann 2017-03-27 07:34:58 +02:00
parent fde670e83f
commit 09facb09b4
36 changed files with 917 additions and 661 deletions

View File

@ -7,7 +7,7 @@
"watch": "webpack -d --watch", "watch": "webpack -d --watch",
"build": "webpack", "build": "webpack",
"lint": "eslint src --ext .js --ext .jsx --cache", "lint": "eslint src --ext .js --ext .jsx --cache",
"test": "mocha --compilers js:babel-core/register test/*.spec.js", "test": "mocha --compilers js:babel-core/register test --recursive",
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p", "release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
"deploy": "npm run release && cp -r build/* docs/" "deploy": "npm run release && cp -r build/* docs/"
}, },

View File

@ -1,13 +1,12 @@
import React from 'react'; import React from 'react';
import PageView from './PageView.jsx'; import PageView from './PageView.jsx';
import TextItemTable from './TextItemTable.jsx'; import LineItemTable from './LineItemTable.jsx';
// View for a Page which items are of kind TextItemBlock // View for a Page which items are of kind LineItemBlock
export default class TextItemBlockPageView extends PageView { export default class LineItemBlockPageView extends PageView {
createItemViews(items, showWhitespaces) { createItemViews(items, showWhitespaces) {
const blockTables = items.map((block, i) => { const blockTables = items.map((block, i) => {
var textItems = block.textItems;
const blockType = block.type ? ' - ' + block.type.name : null; const blockType = block.type ? ' - ' + block.type.name : null;
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span> const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
: null; : null;
@ -38,7 +37,7 @@ export default class TextItemBlockPageView extends PageView {
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i> <b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
</div> </div>
<div style={ borderStyle }> <div style={ borderStyle }>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } /> <LineItemTable items={ block.items } showWhitespaces={ showWhitespaces } />
{ footnoteLinks } { footnoteLinks }
{ footnotes } { footnotes }
</div> </div>

View File

@ -0,0 +1,12 @@
import React from 'react';
import PageView from './PageView.jsx';
import LineItemTable from './LineItemTable.jsx';
// View for a Page which items are of kind LineItem
export default class LineItemPageView extends PageView {
createItemViews(items, showWhitespaces) {
return <LineItemTable items={ items } showWhitespaces={ showWhitespaces } />
}
}

View File

@ -0,0 +1,108 @@
import React from 'react';
import Table from 'react-bootstrap/lib/Table'
// Displays an array of LineItem as a table: one row per item with the columns
// '#', 'Text', 'X', 'Y', 'Width' and 'Height'.
export default class LineItemTable extends React.Component {
// items: the array of line items to render (required)
// showWhitespaces: when true, the item text is rendered inside a <pre> element
static propTypes = {
items: React.PropTypes.array.isRequired,
showWhitespaces: React.PropTypes.bool
};
// Builds the table header and one row per item and wraps both into a react-bootstrap Table.
render() {
const {showWhitespaces, items} = this.props;
// Static header row naming the six columns
const tableHeader = <thead>
<tr>
<th>
#
</th>
<th>
Text
</th>
<th>
X
</th>
<th>
Y
</th>
<th>
Width
</th>
<th>
Height
</th>
</tr>
</thead>
// One row per item, keyed by its array index. An annotated item is drawn in the color of its annotation.
// The first cell stacks: the index, the annotation category, the type name and - if present on the item -
// markers for footnote links, links, line format, unopened/unclosed format and the number of inline formats.
// The text cell shows item.text(), the remaining cells show x, y, width and height as they are.
const itemRows = items.map((item, i) => <tr key={ i } style={ item.annotation ? {
color: item.annotation.color
} : null }>
<td>
<div style={ { textAlign: 'center' } }>
{ i }
</div>
<div style={ { textAlign: 'center' } }>
{ item.annotation ? item.annotation.category : '' }
</div>
<div style={ { textAlign: 'center', color: 'brown' } }>
{ item.type ? item.type.name : '' }
</div>
<div style={ { textAlign: 'center', color: 'orange' } }>
{ item.parsedElements && item.parsedElements.footnoteLinks.length > 0 ? <div>
Footnote-Link
</div> : '' }
{ item.parsedElements && item.parsedElements.containLinks ? <div>
Link
</div> : '' }
{ item.lineFormat ? <div>
{ item.lineFormat.name }
</div> : '' }
{ item.unopenedFormat ? <div>
Unopened
{ ' ' + item.unopenedFormat.name }
</div> : '' }
{ item.parsedElements && item.parsedElements.inlineFormats > 0 ? <div>
{ item.parsedElements.inlineFormats + 'x Bold/Italic' }
</div> : '' }
{ item.unclosedFormat ? <div>
Unclosed
{ ' ' + item.unclosedFormat.name }
</div> : '' }
</div>
</td>
<td>
{ showWhitespaces ? (
<pre style={ item.annotation ? {
color: item.annotation.color,
display: 'inline-block',
} : {
display: 'inline-block'
} }>{ item.text() }</pre>
) : (item.text()) }
</td>
<td>
{ item.x }
</td>
<td>
{ item.y }
</td>
<td>
{ item.width }
</td>
<td>
{ item.height }
</td>
</tr>
)
// Header and rows combined into the final table element
return (
<Table responsive condensed bordered>
{ tableHeader }
<tbody>
{ itemRows }
</tbody>
</Table>
);
}
}

View File

@ -18,6 +18,17 @@ export function isNumber(string) {
return true; return true;
} }
// Tells whether every UTF-16 code unit of `string` equals the first code unit of `char`.
// An empty `string` yields true, since there is no deviating character.
export function hasOnly(string, char) {
    const expectedCode = char.charCodeAt(0);
    let index = 0;
    while (index < string.length) {
        if (string.charCodeAt(index) !== expectedCode) {
            return false;
        }
        index += 1;
    }
    return true;
}
export function hasUpperCaseCharacterInMiddleOfWord(text) { export function hasUpperCaseCharacterInMiddleOfWord(text) {
var beginningOfWord = true; var beginningOfWord = true;
for (var i = 0; i < text.length; i++) { for (var i = 0; i < text.length; i++) {

View File

@ -1,6 +1,7 @@
import { Enum } from 'enumify'; import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx'; import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
import CompactLines from './transformations/textitem/CompactLines.jsx'; import CompactLines from './transformations/textitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
@ -56,10 +57,10 @@ export default class AppState {
new CompactLines(), new CompactLines(),
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new VerticalToHorizontal(), new VerticalToHorizontal(),
new PostprocessLines(), // new PostprocessLines(),
new DetectTOC(), new DetectTOC(),
new DetectHeaders(), new DetectHeaders(),
new CompleteFormats(), // new CompleteFormats(),
new DetectListItems(), new DetectListItems(),
new GatherBlocks(), new GatherBlocks(),

View File

@ -1,83 +1,85 @@
import { Enum } from 'enumify'; import { Enum } from 'enumify';
import TextItem from './TextItem.jsx'; import LineItem from './LineItem.jsx';
import TextItemBlock from './TextItemBlock.jsx'; import LineItemBlock from './LineItemBlock.jsx';
// An Markdown element // An Markdown element
export default class ElementType extends Enum { export default class ElementType extends Enum {
} }
//TODO rename to BlockType
ElementType.initEnum({ ElementType.initEnum({
H1: { H1: {
headline: true, headline: true,
headlineLevel: 1, headlineLevel: 1,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return '# ' + concatTextItems(block.textItems); return '# ' + concatLineItems(block.items);
} }
}, },
H2: { H2: {
headline: true, headline: true,
headlineLevel: 2, headlineLevel: 2,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return '## ' + concatTextItems(block.textItems); return '## ' + concatLineItems(block.items);
} }
}, },
H3: { H3: {
headline: true, headline: true,
headlineLevel: 3, headlineLevel: 3,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return '### ' + concatTextItems(block.textItems); return '### ' + concatLineItems(block.items);
} }
}, },
H4: { H4: {
headline: true, headline: true,
headlineLevel: 4, headlineLevel: 4,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return '#### ' + concatTextItems(block.textItems); return '#### ' + concatLineItems(block.items);
} }
}, },
H5: { H5: {
headline: true, headline: true,
headlineLevel: 5, headlineLevel: 5,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return '##### ' + concatTextItems(block.textItems); return '##### ' + concatLineItems(block.items);
} }
}, },
H6: { H6: {
headline: true, headline: true,
headlineLevel: 6, headlineLevel: 6,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return '###### ' + concatTextItems(block.textItems); return '###### ' + concatLineItems(block.items);
} }
}, },
TOC: { TOC: {
mergeToBlock: true, mergeToBlock: true,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return concatTextItems(block.textItems); return concatLineItems(block.items);
} }
}, },
FOOTNOTES: { FOOTNOTES: {
mergeToBlock: true, mergeToBlock: true,
mergeFollowingNonTypedItems: true, mergeFollowingNonTypedItems: true,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return concatTextItems(block.textItems); return concatLineItems(block.items);
} }
}, },
CODE: { CODE: {
mergeToBlock: true, mergeToBlock: true,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return '```\n' + concatTextItems(block.textItems) + '```' return '```\n' + concatLineItems(block.items) + '```'
} }
}, },
LIST: { LIST: {
mergeToBlock: true, mergeToBlock: true,
mergeFollowingNonTypedItemsWithSmallDistance: true, mergeFollowingNonTypedItemsWithSmallDistance: true,
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return concatTextItems(block.textItems); return concatLineItems(block.items);
} }
}, },
PARAGRAPH: { PARAGRAPH: {
toText(block:TextItemBlock) { toText(block:LineItemBlock) {
return concatTextItems(block.textItems); return concatLineItems(block.items);
} }
} }
}); });
@ -86,17 +88,17 @@ export function isHeadline(elementType: ElementType) {
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H' return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
} }
export function blockToText(block: TextItemBlock) { export function blockToText(block: LineItemBlock) {
if (!block.type) { if (!block.type) {
return concatTextItems(block.textItems); return concatLineItems(block.items);
} }
return block.type.toText(block); return block.type.toText(block);
} }
function concatTextItems(textItems: TextItem[]) { function concatLineItems(lineItems: LineItem[]) {
var text = ''; var text = '';
textItems.forEach(item => { lineItems.forEach(item => {
text += item.text + '\n'; text += item.text() + '\n';
}); });
return text; return text;
} }

View File

@ -4,24 +4,25 @@ export default class HeadlineFinder {
constructor(options) { constructor(options) {
this.headlineCharCodes = normalizedCharCodeArray(options.headline); this.headlineCharCodes = normalizedCharCodeArray(options.headline);
this.stackedTextItems = []; this.stackedLineItems = [];
this.stackedChars = 0; this.stackedChars = 0;
} }
consume(textItem) { consume(lineItem) {
const normalizedCharCodes = normalizedCharCodeArray(textItem.text); //TODO avoid join
const normalizedCharCodes = normalizedCharCodeArray(lineItem.text());
const matchAll = this.matchAll(normalizedCharCodes); const matchAll = this.matchAll(normalizedCharCodes);
if (matchAll) { if (matchAll) {
this.stackedTextItems.push(textItem); this.stackedLineItems.push(lineItem);
this.stackedChars += normalizedCharCodes.length; this.stackedChars += normalizedCharCodes.length;
if (this.stackedChars == this.headlineCharCodes.length) { if (this.stackedChars == this.headlineCharCodes.length) {
return this.stackedTextItems; return this.stackedLineItems;
} }
} else { } else {
if (this.stackedChars > 0) { if (this.stackedChars > 0) {
this.stackedChars = 0; this.stackedChars = 0;
this.stackedTextItems = []; this.stackedLineItems = [];
this.consume(textItem); // test again without stack this.consume(lineItem); // test again without stack
} }
} }
return null; return null;

View File

@ -0,0 +1,145 @@
import TextItem from './TextItem.jsx';
import Word from './Word.jsx';
import WordType from './markdown/WordType.jsx';
import LineItem from './LineItem.jsx';
import StashingStream from './StashingStream.jsx';
import { ParsedElements } from './PageItem.jsx';
import { isNumber } from '../functions.jsx'
import { sortByX } from '../pageItemFunctions.jsx'
// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
export default class LineConverter {
constructor(fontToFormats) {
this.fontToFormats = fontToFormats;
}
// returns a CombineResult
compact(textItems: TextItem[]) {
// we can't trust order of occurence, esp. footnoteLinks like to come last
sortByX(textItems);
const wordStream = new WordDetectionStream(this.fontToFormats);
wordStream.consumeAll(textItems.map(item => new TextItem({
...item
})));
const words = wordStream.complete();
var maxHeight = 0;
var widthSum = 0;
textItems.forEach(item => {
maxHeight = Math.max(maxHeight, item.height);
widthSum += item.width;
});
return new LineItem({
x: textItems[0].x,
y: textItems[0].y,
height: maxHeight,
width: widthSum,
words: words,
parsedElements: new ParsedElements({
footnoteLinks: wordStream.footnoteLinks,
footnotes: wordStream.footnotes
})
});
}
}
// Combines the text of the given items, splits it at single spaces and wraps every
// non-blank part into a Word carrying the given format as its type.
function itemsToWords(items, format) {
    const words = [];
    combineText(items).split(' ').forEach(part => {
        if (part.trim().length > 0) {
            words.push(new Word({
                string: part,
                type: format
            }));
        }
    });
    return words;
}
// Concatenates the texts of the given items in their given order.
// A single space is put between two neighbours if neither side already brings one
// (text so far ends with ' ' / next text starts with ' ') and the horizontal gap
// between them (next.x - previous.x - previous.width) is bigger than 5.
function combineText(textItems) {
    let combined = '';
    let previous;
    textItems.forEach(current => {
        const mayNeedSpace = previous && !combined.endsWith(' ') && !current.text.startsWith(' ');
        if (mayNeedSpace && current.x - previous.x - previous.width > 5) {
            combined += ' ';
        }
        combined += current.text;
        previous = current;
    });
    return combined;
}
// Stream which turns the text items of one line into Word(s).
// Every item gets stashed; the stash is flushed as soon as an item does not match the stashed ones,
// i.e. its font maps to a different format or it switches between 'number' and 'no number'.
// A flushed run of numbers becomes a FOOTNOTE_LINK or a FOOTNOTE word (depending on its y position),
// everything else is split into plain words carrying the format of the run's first font.
class WordDetectionStream extends StashingStream {

    constructor(fontToFormats) {
        super();
        this.fontToFormats = fontToFormats;
        this.footnoteLinks = []; // numbers (parsed) of the detected footnote links
        this.footnotes = []; // numbers (as string) of the detected footnotes
        this.firstY = undefined; // y of the first consumed item
        this.stashedNumber = false; // whether the most recently stashed item was a number
        this.currentItem = undefined; // the most recently consumed item
    }

    // Every item gets stashed. Remembers the y of the first item and the item currently consumed.
    shouldStash(item) {
        // explicit check for 'not set yet': a plain truthiness check would treat a first item
        // at y = 0 as missing and overwrite the baseline with the y of a later item
        if (this.firstY === undefined) {
            this.firstY = item.y;
        }
        this.currentItem = item;
        return true;
    }

    onPushOnStash(item) {
        this.stashedNumber = isNumber(item.text.trim());
    }

    // An item matches the stash if its font maps to the same format as the last stashed item
    // and it is a number exactly when the last stashed item was one.
    doMatchesStash(lastItem, item) {
        const lastItemFormat = this.fontToFormats.get(lastItem.font);
        const itemFormat = this.fontToFormats.get(item.font);
        if (lastItemFormat !== itemFormat) {
            return false;
        }
        const itemIsANumber = isNumber(item.text.trim());
        // loose comparison kept on purpose: isNumber() is defined elsewhere
        return this.stashedNumber == itemIsANumber;
    }

    // Converts the stashed items to words and appends them to the results.
    doFlushStash(stash, results) {
        if (!this.stashedNumber) {
            this.copyStashItemsAsText(stash, results);
            return;
        }
        const joinedNumber = stash.map(item => item.text).join('');
        if (stash[0].y > this.firstY) { // footnote link
            results.push(new Word({
                string: joinedNumber,
                type: WordType.FOOTNOTE_LINK
            //TODO format to
            //^
            //`<sup>[${joinedNumber}](#${joinedNumber})</sup>`
            }));
            this.footnoteLinks.push(Number.parseInt(joinedNumber, 10));
        } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote
            // currentItem is the item which caused the flush (or the last consumed one on complete())
            results.push(new Word({
                string: joinedNumber,
                type: WordType.FOOTNOTE
            //TODO format to (^${ joinedNumber}):
            }));
            this.footnotes.push(joinedNumber);
        } else {
            this.copyStashItemsAsText(stash, results);
        }
    }

    // Appends the stashed items as plain words, typed with the format of the first item's font.
    copyStashItemsAsText(stash, results) {
        const format = this.fontToFormats.get(stash[0].font);
        results.push(...itemsToWords(stash, format));
    }

}

View File

@ -0,0 +1,29 @@
import PageItem from './PageItem.jsx'
import Word from './Word.jsx'
// A line within a page, consisting of Word(s)
export default class LineItem extends PageItem {

    // Takes position and size from the options. The words are either given directly (`words`)
    // or, if only `text` is given, created by splitting the text at spaces (blank parts are dropped).
    constructor(options) {
        super(options);
        const {x, y, width, height, words, text} = options;
        this.x = x;
        this.y = y;
        this.width = width;
        this.height = height;
        if (words) {
            this.words = words;
        } else if (text) {
            this.words = text.split(" ")
                .filter(part => part.trim().length > 0)
                .map(part => new Word({
                    string: part
                }));
        } else {
            this.words = [];
        }
    }

    // The strings of all words, joined by a single space
    text() {
        const strings = this.wordStrings();
        return strings.join(" ");
    }

    // The plain string of every word, in order
    wordStrings() {
        return this.words.map(({string}) => string);
    }

}

View File

@ -0,0 +1,36 @@
import PageItem from './PageItem.jsx'
import LineItem from './LineItem.jsx'
// A block of LineItem[] within a Page
export default class LineItemBlock extends PageItem {
constructor(options) {
super(options);
this.items = [];
if (options.items) {
options.items.forEach(item => this.addItem(item));
}
}
addItem(item:LineItem) {
if (this.type && item.type && this.type !== item.type) {
throw `Adding item of type ${item.type} to block of type ${this.type}`
}
if (!this.type) {
this.type = item.type;
}
if (item.parsedElements) {
if (this.parsedElements) {
this.parsedElements.add(item.parsedElements);
} else {
this.parsedElements = item.parsedElements;
}
}
const copiedItem = new LineItem({
...item
});
copiedItem.type = null;
this.items.push(copiedItem);
}
}

View File

@ -1,4 +1,4 @@
// A abstract PageItem class, can be TextItem, or TextItemBlock // A abstract PageItem class, can be TextItem, LineItem or LineItemBlock
export default class PageItem { export default class PageItem {
constructor(options) { constructor(options) {

View File

@ -0,0 +1,73 @@
// Abstract stream which allows to stash items temporarily.
//
// Items are fed through consume()/consumeAll(). For every item the sub-class decides (shouldStash)
// whether it goes directly to the results or onto the stash. Before an item is stashed, the stash is
// flushed (doFlushStash) if the item does not match the stashed ones (doMatchesStash).
// complete() flushes what is left and returns the results.
//
// Sub-classes must implement shouldStash(), doMatchesStash() and doFlushStash().
export default class StashingStream {

    constructor() {
        if (this.constructor === StashingStream) {
            throw new TypeError("Can not construct abstract class.");
        }
        this.results = [];
        this.stash = [];
    }

    // Consumes all given items in order
    consumeAll(items) {
        items.forEach(item => this.consume(item));
    }

    // Consumes a single item: it is either stashed or pushed to the results (after flushing the stash)
    consume(item) {
        if (this.shouldStash(item)) {
            if (!this.matchesStash(item)) {
                this.flushStash();
            }
            this.pushOnStash(item);
        } else {
            // flushStash() is a no-op on an empty stash
            this.flushStash();
            this.results.push(item);
        }
    }

    // Notifies the sub-class (onPushOnStash) and puts the item on the stash
    pushOnStash(item) {
        this.onPushOnStash(item);
        this.stash.push(item);
    }

    // Flushes a remaining stash and returns the results
    complete() {
        this.flushStash();
        return this.results;
    }

    // return true if the item matches the items of the stash (an empty stash matches everything)
    matchesStash(item) {
        if (this.stash.length === 0) {
            return true;
        }
        const lastItem = this.stash[this.stash.length - 1];
        return this.doMatchesStash(lastItem, item);
    }

    // Hands a non-empty stash to doFlushStash() and starts a new, empty stash afterwards
    flushStash() {
        if (this.stash.length > 0) {
            this.doFlushStash(this.stash, this.results);
            this.stash = [];
        }
    }

    // Hook which is called right before an item is put on the stash
    onPushOnStash(item) { // eslint-disable-line no-unused-vars
        //sub-classes may override
    }

    // Abstract: return true if the item should be stashed, false if it should go directly to the results
    shouldStash(item) { // eslint-disable-line no-unused-vars
        throw new TypeError("Abstract method shouldStash(item) must be implemented by the sub-class.");
    }

    // Abstract: return true if the item can join the stash whose most recent item is lastItem
    doMatchesStash(lastItem, item) { // eslint-disable-line no-unused-vars
        throw new TypeError("Abstract method doMatchesStash(lastItem, item) must be implemented by the sub-class.");
    }

    // Abstract: convert the (non-empty) stash and add the outcome to the results
    doFlushStash(stash, results) { // eslint-disable-line no-unused-vars
        throw new TypeError("Abstract method doFlushStash(stash, results) must be implemented by the sub-class.");
    }

}

View File

@ -11,8 +11,6 @@ export default class TextItem extends PageItem {
this.height = options.height; this.height = options.height;
this.text = options.text; this.text = options.text;
this.font = options.font; this.font = options.font;
this.fontAscent = options.fontAscent;
this.fontDescent = options.fontDescent;
this.lineFormat = options.lineFormat; this.lineFormat = options.lineFormat;
this.unopenedFormat = options.unopenedFormat; this.unopenedFormat = options.unopenedFormat;

View File

@ -1,36 +0,0 @@
import PageItem from './PageItem.jsx'
import TextItem from './TextItem.jsx'
// A block of TextItem[] within a Page
export default class TextItemBlock extends PageItem {
constructor(options) {
super(options);
this.textItems = [];
if (options.textItems) {
options.textItems.forEach(item => this.addTextItem(item));
}
}
addTextItem(textItem:TextItem) {
if (this.type && textItem.type && this.type !== textItem.type) {
throw `Adding text item of type ${textItem.type} to block of type ${this.type}`
}
if (!this.type) {
this.type = textItem.type;
}
if (textItem.parsedElements) {
if (this.parsedElements) {
this.parsedElements.add(textItem.parsedElements);
} else {
this.parsedElements = textItem.parsedElements;
}
}
const copiedTextItem = new TextItem({
...textItem
});
copiedTextItem.type = null;
this.textItems.push(copiedTextItem);
}
}

View File

@ -1,227 +0,0 @@
import TextItem from './TextItem.jsx';
import { ParsedElements } from './PageItem.jsx';
import { isNumber } from '../functions.jsx'
import { sortByX } from '../textItemFunctions.jsx'
import { prefixAfterWhitespace, suffixBeforeWhitespace } from '../functions.jsx';
// Compacts text items which have been grouped to a line to a single TextItem, doing inline transformations like
// 'whitespace removal', bold/emphasis annotation, footnote/footnote-link detection, etc..
export default class TextItemLineCompactor {
// fontToFormats: looked up by the Formatter with `get(item.font)` to find the format of an item
constructor(fontToFormats) {
this.fontToFormats = fontToFormats;
}
// Returns the combined TextItem of the line, with parsedElements, lineFormat, unopenedFormat and unclosedFormat set.
// Throws (a string) if fewer than 2 items are given.
// NOTE(review): sortByX is called for its effect on the passed array, so the caller's array presumably gets re-ordered - confirm.
compact(lineItems: TextItem[]) {
if (lineItems.length < 2) {
throw "Must be at least 2 line items, but was " + lineItems;
}
// we can't trust order of occurence, esp. footnoteLinks like to come last
sortByX(lineItems);
const formatter = new Formatter(this.fontToFormats);
// step 1: footnotes / footnote links, step 2: format symbols around formatted items
var [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
resolvedLineItems.forEach(item => formatter.consume(item));
resolvedLineItems = formatter.getResults();
parsedElements.inlineFormats = formatter.inlineFormats;
// const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
var combinedItem;
if (resolvedLineItems.length == 1) {
// a single remaining item is used as it is
combinedItem = resolvedLineItems[0];
} else {
var text = '';
var maxHeight = 0;
var widthSum = 0;
var lastItem;
resolvedLineItems.forEach(item => {
// insert a space if neither side brings one and the horizontal gap to the previous item is at least 5
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
const xDistance = item.x - lastItem.x - lastItem.width;
if (xDistance >= 5) {
text += ' ';
}
}
text += item.text;
widthSum += item.width;
lastItem = item;
maxHeight = Math.max(maxHeight, item.height);
});
// all remaining properties are taken over from the first item
combinedItem = new TextItem({
...resolvedLineItems[0],
text: text,
height: maxHeight,
width: widthSum
});
}
combinedItem.parsedElements = parsedElements;
combinedItem.lineFormat = formatter.lineFormat;
combinedItem.unopenedFormat = formatter.unopenedFormat;
combinedItem.unclosedFormat = formatter.unclosedFormat;
return combinedItem;
}
// Detects footnote links and footnotes among the number items of the line.
// Returns a pair: the new list of line items and the ParsedElements holding the found footnoteLinks and footnotes.
// Whitespace-only items at the beginning of the line are dropped.
resolveSpecialElements(lineItems) {
const footnoteLinks = [];
const footnotes = [];
// y of the first item, taken as the reference y of the line
const basicY = lineItems[0].y;
const newLineItems = [];
// consecutive number items get collected here until a non-number item (or the end of the line) follows
var stashedNumberItems = [];
// Turns the stashed number items into one footnote-link item, one footnote item or keeps them unchanged.
// nextItem is the item following the stashed numbers (undefined at the end of the line).
const commitStashedNumbers = (nextItem) => {
if (stashedNumberItems.length > 0) {
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
if (stashedNumberItems[0].y > basicY) { // footnote link
newLineItems.push(new TextItem({
...stashedNumberItems[0],
//TODO make fomatting configurable
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
text: `^${joinedNumber}`
}));
footnoteLinks.push(parseInt(joinedNumber));
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
//TODO womb comp [29] => ydiff == 0
newLineItems.push(new TextItem({
...stashedNumberItems[0],
text: `(^${ joinedNumber}): `
}));
footnotes.push(joinedNumber);
} else {
// ordinary numbers: keep the items as they are
stashedNumberItems.forEach(number => newLineItems.push(number));
}
stashedNumberItems = [];
}
};
lineItems.forEach(item => {
if (newLineItems.length == 0 && item.text.trim().length == 0) {
// skip whitespace on the beginning of a line
} else {
const isANumber = isNumber(item.text.trim());
if (isANumber) {
stashedNumberItems.push(item);
} else {
if (stashedNumberItems.length > 0) {
commitStashedNumbers(item);
}
newLineItems.push(item);
}
}
});
// numbers at the end of the line have no following item
commitStashedNumbers();
return [newLineItems, new ParsedElements({
footnoteLinks: footnoteLinks,
footnotes: footnotes
})];
}
}
// Puts the start/end symbols of a format around consecutive items whose font needs a format.
//
// Items are fed one by one through consume(), getResults() closes a still open format and returns
// the resulting items. Afterwards the following can be read:
//  - lineFormat: the format, if the whole line was in one format (no symbols are added then)
//  - unopenedFormat: the format, if the line started formatted and the format ended within the line
//    (only the closing symbol is added)
//  - unclosedFormat: the format, if a format started within the line and lasted to its end
//    (only the opening symbol is added)
//  - inlineFormats: the number of formats which started and ended within the line
class Formatter {

    // fontToFormats: must provide get(font), returning a format with `needFormat`, `startSymbol` and `endSymbol`
    constructor(fontToFormats) {
        this.fontToFormats = fontToFormats;
        this.resultItems = [];
        this.lineFormat = undefined;
        this.unopenedFormat = undefined;
        this.unclosedFormat = undefined;
        this.openFormat = undefined; // format of the currently stashed items
        this.stashedItems = []; // consecutive items of the open format
        this.inlineFormats = 0;
        this.lastItem = undefined;
    }

    // Takes the next item of the line. Items needing a format are stashed until the format changes.
    consume(item) {
        const formatType = this.fontToFormats.get(item.font);
        if (this.openFormat && formatType !== this.openFormat) {
            this.flushStash(false);
        }
        if (formatType.needFormat) {
            this.openFormat = formatType;
            this.stashedItems.push(item);
        } else {
            this.resultItems.push(item);
        }
    }

    // Closes a still open format and returns all resulting items
    getResults() {
        if (this.openFormat) {
            this.flushStash(true);
        }
        return this.resultItems;
    }

    // Moves the stashed items to the results, adding the format symbols where they are needed.
    // formatToEndOfLine: true if the open format lasts until the end of the line.
    flushStash(formatToEndOfLine) {
        // explicit length check; comparing the array itself with 0 only worked through string coercion
        const formatFromBeginningOfLine = this.resultItems.length === 0;
        if (formatFromBeginningOfLine) {
            if (formatToEndOfLine) {
                this.lineFormat = this.openFormat;
                this.moveStashItemsToResult();
            } else {
                this.unopenedFormat = this.openFormat;
                const newLastItem = this.newClosingItem(this.stashedItems.pop());
                this.moveStashItemsToResult();
                this.resultItems.push(newLastItem);
            }
        } else {
            if (formatToEndOfLine) {
                this.unclosedFormat = this.openFormat;
                const newFirstItem = this.newOpeningItem(this.stashedItems.shift());
                this.resultItems.push(newFirstItem);
                this.moveStashItemsToResult();
            } else {
                this.inlineFormats++;
                if (this.stashedItems.length == 1) {
                    const onlyItem = this.stashedItems.pop();
                    // a formatted item consisting of whitespace only is dropped
                    if (onlyItem.text.trim().length > 0) {
                        const onlyItemFormatted = this.newCompleteItem(onlyItem);
                        this.resultItems.push(onlyItemFormatted);
                    }
                    // the stash is empty here, the call resets the open format
                    this.moveStashItemsToResult();
                } else {
                    // shift/pop must happen before the remaining stash is moved, while openFormat is still set
                    const firstItem = this.newOpeningItem(this.stashedItems.shift());
                    const lastItem = this.newClosingItem(this.stashedItems.pop());
                    this.resultItems.push(firstItem);
                    this.moveStashItemsToResult();
                    this.resultItems.push(lastItem);
                }
            }
        }
    }

    // Appends all stashed items to the results and resets stash and open format
    moveStashItemsToResult() {
        this.resultItems.push(...this.stashedItems);
        this.stashedItems = [];
        this.openFormat = null;
    }

    // Copy of the item with the start symbol of the open format put in front of its text
    newOpeningItem(item) {
        return new TextItem({
            ...item,
            text: prefixAfterWhitespace(this.openFormat.startSymbol, item.text)
        });
    }

    // Copy of the item with the end symbol of the open format appended to its text
    newClosingItem(item) {
        return new TextItem({
            ...item,
            text: suffixBeforeWhitespace(item.text, this.openFormat.endSymbol)
        });
    }

    // Copy of the item with its text enclosed by start and end symbol of the open format
    newCompleteItem(item) {
        return new TextItem({
            ...item,
            text: suffixBeforeWhitespace(prefixAfterWhitespace(this.openFormat.startSymbol, item.text), this.openFormat.endSymbol)
        });
    }

}

View File

@ -1,5 +1,5 @@
import TextItem from './TextItem.jsx'; import TextItem from './TextItem.jsx';
import { sortByX } from '../textItemFunctions.jsx' import { sortByX } from '../pageItemFunctions.jsx'
//Groups all text items which are on the same y line //Groups all text items which are on the same y line
export default class TextItemLineGrouper { export default class TextItemLineGrouper {

View File

@ -0,0 +1,8 @@
// A single word of a line: its plain string plus an optional type
export default class Word {

    // options.string: the text of the word
    // options.type: the WordType of the word, if any
    constructor(options) {
        const {string, type} = options;
        this.string = string;
        this.type = type; // WordType
    }

}

View File

@ -0,0 +1,7 @@
import { Enum } from 'enumify';
// The markdown types a word can have
export default class WordType extends Enum {
}

// one enum constant per name
WordType.initEnum([
    'LINK',
    'FOOTNOTE_LINK',
    'FOOTNOTE',
    'BOLD',
    'OBLIQUE',
    'BOLD_OBLIQUE'
]);

View File

@ -1,16 +1,16 @@
import React from 'react'; import React from 'react';
import Transformation from './Transformation.jsx'; import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx'; import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx'; import LineItemBlock from '../LineItemBlock.jsx';
import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx'; import LineItemBlockPageView from '../../components/debug/LineItemBlockPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx'; import { REMOVED_ANNOTATION } from '../Annotation.jsx';
// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView // Abstract class for transformations producing LineItemBlock(s) to be shown in the LineItemBlockPageView
export default class ToTextItemBlockTransformation extends Transformation { export default class ToLineItemBlockTransformation extends Transformation {
constructor(name) { constructor(name) {
super(name, TextItemBlock.name); super(name, LineItemBlock.name);
if (this.constructor === ToTextItemBlockTransformation) { if (this.constructor === ToLineItemBlockTransformation) {
throw new TypeError("Can not construct abstract class."); throw new TypeError("Can not construct abstract class.");
} }
this.showWhitespaces = false; this.showWhitespaces = false;
@ -25,7 +25,7 @@ export default class ToTextItemBlockTransformation extends Transformation {
} }
createPageView(page, modificationsOnly) { createPageView(page, modificationsOnly) {
return <TextItemBlockPageView return <LineItemBlockPageView
key={ page.index } key={ page.index }
page={ page } page={ page }
modificationsOnly={ modificationsOnly } modificationsOnly={ modificationsOnly }

View File

@ -0,0 +1,46 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
import LineItem from '../LineItem.jsx';
import LineItemPageView from '../../components/debug/LineItemPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
// Abstract class for transformations producing LineItem(s) to be shown in the LineItemPageView
export default class ToLineItemTransformation extends Transformation {
constructor(name) {
super(name, LineItem.name);
if (this.constructor === ToLineItemTransformation) {
throw new TypeError("Can not construct abstract class.");
}
this.showWhitespaces = false;
}
showPageSelection() {
return true;
}
showModificationCheckbox() {
return true;
}
createPageView(page, modificationsOnly) {
return <LineItemPageView
key={ page.index }
page={ page }
modificationsOnly={ modificationsOnly }
showWhitespaces={ this.showWhitespaces } />;
}
completeTransform(parseResult:ParseResult) {
// The usual cleanup
parseResult.messages = [];
parseResult.pages.forEach(page => {
page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
page.items.forEach(item => item.annotation = null);
});
return parseResult;
}
}

View File

@ -1,6 +1,7 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import StringFormat from '../../StringFormat.jsx'; import WordType from '../../markdown/WordType.jsx';
// import StringFormat from '../../StringFormat.jsx';
export default class CalculateGlobalStats extends ToTextItemTransformation { export default class CalculateGlobalStats extends ToTextItemTransformation {
@ -54,21 +55,21 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
this.fontMap.forEach(function(value, key) { this.fontMap.forEach(function(value, key) {
fontIdToName.push(key + " = " + value.name) fontIdToName.push(key + " = " + value.name)
const fontName = value.name.toLowerCase(); const fontName = value.name.toLowerCase();
var format; var type;
if (key == mostUsedFont) { if (key == mostUsedFont) {
format = StringFormat.STANDARD; type = null;
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) { } else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
format = StringFormat.BOLD_OBLIQUE; type = WordType.BOLD_OBLIQUE;
} else if (fontName.includes('bold')) { } else if (fontName.includes('bold')) {
format = StringFormat.BOLD; type = WordType.BOLD;
} else if (fontName.includes('oblique') || fontName.includes('italic')) { } else if (fontName.includes('oblique') || fontName.includes('italic')) {
format = StringFormat.OBLIQUE; type = WordType.OBLIQUE;
} else if (fontName === maxHeightFont) { } else if (fontName === maxHeightFont) {
format = StringFormat.BOLD; type = WordType.BOLD;
} else { }
format = StringFormat.STANDARD; if (type) {
fontToFormats.set(key, type);
} }
fontToFormats.set(key, format);
}); });
fontIdToName.sort(); fontIdToName.sort();

View File

@ -1,16 +1,16 @@
import React from 'react'; import React from 'react';
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import { ParsedElements } from '../../PageItem.jsx'; import LineItem from '../../LineItem.jsx';
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx'; import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
import TextItemLineCompactor from '../../TextItemLineCompactor.jsx'; import LineConverter from '../../LineConverter.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
// gathers text items on the same y line to one text item // gathers text items on the same y line to one line item
export default class CompactLines extends ToTextItemTransformation { export default class CompactLines extends ToLineItemTransformation {
constructor() { constructor() {
super("Compact To Lines"); super("Compact To Lines");
@ -20,58 +20,44 @@ export default class CompactLines extends ToTextItemTransformation {
const {mostUsedDistance, fontToFormats} = parseResult.globals; const {mostUsedDistance, fontToFormats} = parseResult.globals;
const foundFootnotes = []; const foundFootnotes = [];
const foundFootnoteLinks = []; const foundFootnoteLinks = [];
var inlineFormats = 0; var formattedWords = 0;
var lineFormats = 0;
var unopenedFormats = 0;
var unclosedFormats = 0;
const lineGrouper = new TextItemLineGrouper({ const lineGrouper = new TextItemLineGrouper({
mostUsedDistance: mostUsedDistance, mostUsedDistance: mostUsedDistance,
}); });
const lineCompactor = new TextItemLineCompactor(fontToFormats); const lineCompactor = new LineConverter(fontToFormats);
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
if (page.items.length > 0) { if (page.items.length > 0) {
const newItems = []; const lineItems = [];
const textItemsGroupedByLine = lineGrouper.group(page.items); const textItemsGroupedByLine = lineGrouper.group(page.items);
textItemsGroupedByLine.forEach(textItemsOfLine => { textItemsGroupedByLine.forEach(lineTextItems => {
var lineItem; const lineItem = lineCompactor.compact(lineTextItems);
if (textItemsOfLine.length == 1) { if (lineTextItems.length > 1) {
lineItem = textItemsOfLine[0];
const formatType = fontToFormats.get(lineItem.font);
if (formatType.needFormat) {
lineItem.lineFormat = formatType;
lineItem.parsedElements = new ParsedElements({
completeLineFormats: 1
});
}
} else {
textItemsOfLine.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
newItems.push(item);
});
lineItem = lineCompactor.compact(textItemsOfLine);
lineItem.annotation = ADDED_ANNOTATION; lineItem.annotation = ADDED_ANNOTATION;
lineTextItems.forEach(item => {
if (lineItem.parsedElements.footnoteLinks.length > 0) { item.annotation = REMOVED_ANNOTATION;
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>); lineItems.push(new LineItem({
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); ...item
} }));
if (lineItem.parsedElements.footnotes.length > 0) { });
lineItem.type = ElementType.FOOTNOTES; }
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>); if (lineItem.words.length == 0) {
foundFootnotes.push.apply(foundFootnotes, footnotes); lineItem.annotation = REMOVED_ANNOTATION;
} }
inlineFormats += lineItem.parsedElements.inlineFormats; lineItems.push(lineItem);
if (lineItem.parsedElements.footnoteLinks.length > 0) {
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
}
if (lineItem.parsedElements.footnotes.length > 0) {
lineItem.type = ElementType.FOOTNOTES;
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
foundFootnotes.push.apply(foundFootnotes, footnotes);
} }
if (lineItem.lineFormat) lineFormats++;
if (lineItem.unopenedFormat) unopenedFormats++;
if (lineItem.unclosedFormat) unclosedFormats++;
lineItem.text = lineItem.text.trim();
newItems.push(lineItem);
}); });
page.items = newItems; page.items = lineItems;
} }
}); });
@ -79,11 +65,8 @@ export default class CompactLines extends ToTextItemTransformation {
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
messages: [ messages: [
'Detected ' + lineFormats + ' line formats', 'Detected ' + formattedWords + ' formatted words',
'Detected ' + inlineFormats + ' inline formats', <span>Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }]</span>,
'Detected ' + unclosedFormats + ' opened un-closed formats',
'Detected ' + unopenedFormats + ' un-opened closed formats',
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>, <span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
] ]
}); });

View File

@ -6,6 +6,8 @@ import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../.
//Complete unopened/unclosed bold/italic formats //Complete unopened/unclosed bold/italic formats
export default class CompleteFormats extends ToTextItemTransformation { export default class CompleteFormats extends ToTextItemTransformation {
//TODO move to block and ignore quotes
constructor() { constructor() {
super("Complete Bold/Italics"); super("Complete Bold/Italics");
} }
@ -81,7 +83,6 @@ class ItemStack {
} }
consume(item) { consume(item) {
const te = item.text;
var newItem; var newItem;
const handleFreshUnopened = () => { const handleFreshUnopened = () => {

View File

@ -1,4 +1,4 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
@ -6,7 +6,7 @@ import { headlineByLevel } from '../../ElementType.jsx';
import { isListItem } from '../../../functions.jsx'; import { isListItem } from '../../../functions.jsx';
//Detect items starting with -, , etc... //Detect items starting with -, , etc...
export default class DetectHeaders extends ToTextItemTransformation { export default class DetectHeaders extends ToLineItemTransformation {
constructor() { constructor() {
super("Detect Headers"); super("Detect Headers");
@ -21,15 +21,15 @@ export default class DetectHeaders extends ToTextItemTransformation {
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight); const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
pagesWithMaxHeight.forEach(titlePage => { pagesWithMaxHeight.forEach(titlePage => {
titlePage.items.forEach(textItem => { titlePage.items.forEach(item => {
const height = textItem.height; const height = item.height;
if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) { if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
if (height == maxHeight) { if (height == maxHeight) {
textItem.type = ElementType.H1; item.type = ElementType.H1;
} else { } else {
textItem.type = ElementType.H2; item.type = ElementType.H2;
} }
textItem.annotation = DETECTED_ANNOTATION; item.annotation = DETECTED_ANNOTATION;
detectedHeaders++; detectedHeaders++;
} }
}); });
@ -41,10 +41,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
var range = headlineTypeToHeightRange[headlineType]; var range = headlineTypeToHeightRange[headlineType];
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(item => {
if (!textItem.type && textItem.height == range.max) { if (!item.type && item.height == range.max) {
textItem.annotation = DETECTED_ANNOTATION; item.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.enumValueOf(headlineType); item.type = ElementType.enumValueOf(headlineType);
detectedHeaders++ detectedHeaders++
} }
}); });
@ -56,10 +56,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
const heights = []; const heights = [];
var lastHeight; var lastHeight;
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(item => {
if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) { if (!item.type && item.height > mostUsedHeight && !isListItem(item.text())) {
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) { if (!heights.includes(item.height) && (!lastHeight || lastHeight > item.height)) {
heights.push(textItem.height); heights.push(item.height);
} }
} }
}); });
@ -69,11 +69,11 @@ export default class DetectHeaders extends ToTextItemTransformation {
heights.forEach((height, i) => { heights.forEach((height, i) => {
const headlineType = headlineByLevel(2 + i); const headlineType = headlineByLevel(2 + i);
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(item => {
if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) { if (!item.type && item.height == height && !isListItem(item.text())) {
detectedHeaders++; detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION; item.annotation = DETECTED_ANNOTATION;
textItem.type = headlineType; item.type = headlineType;
} }
}); });
}); });
@ -83,9 +83,9 @@ export default class DetectHeaders extends ToTextItemTransformation {
//find headlines which have paragraph height //find headlines which have paragraph height
var smallesHeadlineLevel = 1; var smallesHeadlineLevel = 1;
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(item => {
if (textItem.type && textItem.type.headline) { if (item.type && item.type.headline) {
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel); smallesHeadlineLevel = Math.max(smallesHeadlineLevel, item.type.headlineLevel);
} }
}); });
}); });
@ -93,18 +93,18 @@ export default class DetectHeaders extends ToTextItemTransformation {
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1); const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
var lastItem; var lastItem;
page.items.forEach(textItem => { page.items.forEach(item => {
if (!textItem.type if (!item.type
&& textItem.height == mostUsedHeight && item.height == mostUsedHeight
&& textItem.font !== mostUsedFont && item.font !== mostUsedFont
&& (!lastItem || lastItem.y < textItem.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - textItem.y > mostUsedDistance * 2)) && (!lastItem || lastItem.y < item.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - item.y > mostUsedDistance * 2))
&& textItem.text === textItem.text.toUpperCase() && item.text() === item.text().toUpperCase()
) { ) {
detectedHeaders++; detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION; item.annotation = DETECTED_ANNOTATION;
textItem.type = nextHeadlineType; item.type = nextHeadlineType;
} }
lastItem = textItem; lastItem = item;
}); });
}); });
} }
@ -124,8 +124,8 @@ export default class DetectHeaders extends ToTextItemTransformation {
function findPagesWithMaxHeight(pages, maxHeight) { function findPagesWithMaxHeight(pages, maxHeight) {
const maxHeaderPagesSet = new Set(); const maxHeaderPagesSet = new Set();
pages.forEach(page => { pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(item => {
if (!textItem.type && textItem.height == maxHeight) { if (!item.type && item.height == maxHeight) {
maxHeaderPagesSet.add(page); maxHeaderPagesSet.add(page);
} }
}); });

View File

@ -1,12 +1,12 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx'; import LineItem from '../../LineItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx'; import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
//Detect items starting with -, , etc... //Detect items starting with -, , etc...
export default class DetectListItems extends ToTextItemTransformation { export default class DetectListItems extends ToLineItemTransformation {
constructor() { constructor() {
super("Detect List Items"); super("Detect List Items");
@ -16,34 +16,34 @@ export default class DetectListItems extends ToTextItemTransformation {
var foundListItems = 0; var foundListItems = 0;
var foundNumberedItems = 0; var foundNumberedItems = 0;
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
const newTextItems = []; const newItems = [];
page.items.forEach(textItem => { page.items.forEach(item => {
newTextItems.push(textItem); newItems.push(item);
if (!textItem.type) { if (!item.type) {
var text = textItem.text; var text = item.text();
if (isListItem(text)) { if (isListItem(text)) {
foundListItems++ foundListItems++
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length); const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
if (textWithDash === text) { if (textWithDash === text) {
textItem.annotation = DETECTED_ANNOTATION; item.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.LIST; item.type = ElementType.LIST;
} else { } else {
textItem.annotation = REMOVED_ANNOTATION; item.annotation = REMOVED_ANNOTATION;
newTextItems.push(new TextItem({ newItems.push(new LineItem({
...textItem, ...item,
text: textWithDash, text: textWithDash,
annotation: ADDED_ANNOTATION, annotation: ADDED_ANNOTATION,
type: ElementType.LIST type: ElementType.LIST
})); }));
} }
} else if (isNumberedListItem(text)) { } else if (isNumberedListItem(text)) { //TODO check that starts with 1 (kala chakra)
foundNumberedItems++; foundNumberedItems++;
textItem.annotation = DETECTED_ANNOTATION; item.annotation = DETECTED_ANNOTATION;
textItem.type = ElementType.LIST; item.type = ElementType.LIST;
} }
} }
}); });
page.items = newTextItems; page.items = newItems;
}); });
return new ParseResult({ return new ParseResult({

View File

@ -1,14 +1,15 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx'; import LineItem from '../../LineItem.jsx';
import Word from '../../Word.jsx';
import HeadlineFinder from '../../HeadlineFinder.jsx'; import HeadlineFinder from '../../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx';
import { isDigit, wordMatch } from '../../../functions.jsx' import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx'
//Detect table of contents pages //Detect table of contents pages plus linked headlines
export default class DetectTOC extends ToTextItemTransformation { export default class DetectTOC extends ToLineItemTransformation {
constructor() { constructor() {
super("Detect TOC"); super("Detect TOC");
@ -17,64 +18,68 @@ export default class DetectTOC extends ToTextItemTransformation {
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const tocPages = []; const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length); const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
const linkLeveler = new LinkLeveler(); const linkLeveler = new LinkLeveler();
var tocLinks = []; var tocLinks = [];
var lastTocPage; var lastTocPage;
var headlineItem; var headlineItem;
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => { parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
const lineItemsWithDigits = []; var lineItemsWithDigits = 0;
const unknownLines = new Set(); const unknownLines = new Set();
const pageTocLinks = []; const pageTocLinks = [];
var lastLineTextWithoutNumber; var lastWordsWithoutNumber;
var lastLine; var lastLine;
//find lines ending with a number per page
page.items.forEach(line => { page.items.forEach(line => {
var lineText = line.text.replace(/\./g, '').trim(); var words = line.words.filter(word => !hasOnly(word.string, '.'));
var endsWithDigit = false; const digits = [];
var digits = []; while (words.length > 0 && isNumber(words[words.length - 1].string)) {
while (isDigit(lineText.charCodeAt(lineText.length - 1))) { const lastWord = words.pop();
digits.unshift(lineText.charAt(lineText.length - 1)); digits.unshift(lastWord.string);
lineText = lineText.substring(0, lineText.length - 1);
endsWithDigit = true;
} }
lineText = lineText.trim();
if (digits.length == 0 && words.length > 0) {
const lastWord = words[words.length - 1];
while (isDigit(lastWord.string.charCodeAt(lastWord.string.length - 1))) {
digits.unshift(lastWord.string.charAt(lastWord.string.length - 1))
lastWord.string = lastWord.string.substring(0, lastWord.string.length - 1);
}
}
var endsWithDigit = digits.length > 0;
if (endsWithDigit) { if (endsWithDigit) {
endsWithDigit = true; endsWithDigit = true;
if (lastLineTextWithoutNumber) { // 2-line item ? if (lastWordsWithoutNumber) { // 2-line item ?
lineText = lastLineTextWithoutNumber + ' ' + lineText; words.push(...lastWordsWithoutNumber);
lastLineTextWithoutNumber = null; lastWordsWithoutNumber = null;
} }
pageTocLinks.push(new TocLink({ pageTocLinks.push(new TocLink({
pageNumber: parseInt(digits.join('')), pageNumber: parseInt(digits.join('')),
textItem: new TextItem({ lineItem: new LineItem({
...line, ...line,
text: lineText words: words
}) })
})); }));
lineItemsWithDigits.push(new TextItem({ lineItemsWithDigits++;
...line,
text: lineText
}));
lastLineTextWithoutNumber = null;
} else { } else {
if (!headlineItem) { if (!headlineItem) {
headlineItem = line; headlineItem = line;
} else { } else {
if (lastLineTextWithoutNumber) { if (lastWordsWithoutNumber) {
unknownLines.add(lastLine); unknownLines.add(lastLine);
} }
lastLineTextWithoutNumber = lineText; lastWordsWithoutNumber = words;
lastLine = line; lastLine = line;
} }
} }
}); });
// page has been processed // page has been processed
if (lineItemsWithDigits.length * 100 / page.items.length > 75) { if (lineItemsWithDigits * 100 / page.items.length > 75) {
tocPages.push(page.index + 1); tocPages.push(page.index + 1);
lastTocPage = page; lastTocPage = page;
linkLeveler.levelPageItems(pageTocLinks); linkLeveler.levelPageItems(pageTocLinks);
tocLinks = tocLinks.concat(pageTocLinks); tocLinks.push(...pageTocLinks);
const newBlocks = []; const newBlocks = [];
page.items.forEach((line) => { page.items.forEach((line) => {
@ -83,7 +88,7 @@ export default class DetectTOC extends ToTextItemTransformation {
} }
newBlocks.push(line); newBlocks.push(line);
if (line === headlineItem) { if (line === headlineItem) {
newBlocks.push(new TextItem({ newBlocks.push(new LineItem({
...line, ...line,
type: ElementType.H2, type: ElementType.H2,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION
@ -105,8 +110,10 @@ export default class DetectTOC extends ToTextItemTransformation {
if (tocPages.length > 0) { if (tocPages.length > 0) {
// Add TOC items // Add TOC items
tocLinks.forEach(tocLink => { tocLinks.forEach(tocLink => {
lastTocPage.items.push(new TextItem({ lastTocPage.items.push(new LineItem({
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text, words: [new Word({
string: ' '.repeat(tocLink.level * 3) + '-'
})].concat(tocLink.lineItem.words),
type: ElementType.TOC, type: ElementType.TOC,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION
})); }));
@ -118,11 +125,11 @@ export default class DetectTOC extends ToTextItemTransformation {
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping]; var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
var foundHealineItems; var foundHealineItems;
if (linkedPage) { if (linkedPage) {
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text); foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
if (!foundHealineItems) { // pages are off by 1 ? if (!foundHealineItems) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1]; linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
if (linkedPage) { if (linkedPage) {
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text); foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
} }
} }
} }
@ -142,11 +149,16 @@ export default class DetectTOC extends ToTextItemTransformation {
const headlineType = headlineByLevel(notFoundTocLink.level + 2); const headlineType = headlineByLevel(notFoundTocLink.level + 2);
const heightRange = headlineTypeToHeightRange[headlineType.name]; const heightRange = headlineTypeToHeightRange[headlineType.name];
if (heightRange) { if (heightRange) {
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber); const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
if (textItem) { if (lineIndex > -1) {
textItem.type = headlineType; const page = parseResult.pages[pageIndex];
textItem.annotation = DETECTED_ANNOTATION; page.items[lineIndex].annotation = REMOVED_ANNOTATION;
foundBySize.push(textItem.text); page.items.splice(lineIndex + 1, 0, new LineItem({
...notFoundTocLink.lineItem,
type: headlineType,
annotation: ADDED_ANNOTATION,
}));
foundBySize.push(notFoundTocLink);
} }
} }
}); });
@ -173,12 +185,12 @@ export default class DetectTOC extends ToTextItemTransformation {
const messages = []; const messages = [];
messages.push('Detected ' + tocPages.length + ' table of content pages'); messages.push('Detected ' + tocPages.length + ' table of content pages');
if (tocPages.length > 0) { if (tocPages.length > 0) {
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange)); messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
} }
if (notFoundHeadlines.length > 0) { if (notFoundHeadlines.length > 0) {
messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber)); messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text()));
messages.push('Found TOC headlines (by size): ' + foundBySize); messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber));
} }
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
@ -196,7 +208,7 @@ export default class DetectTOC extends ToTextItemTransformation {
//Find out how the TOC page link actualy translates to the page.index //Find out how the TOC page link actualy translates to the page.index
function detectPageMappingNumber(pages, tocLinks) { function detectPageMappingNumber(pages, tocLinks) {
for ( var tocLink of tocLinks ) { for ( var tocLink of tocLinks ) {
const page = findPageWithHeadline(pages, tocLink.textItem.text); const page = findPageWithHeadline(pages, tocLink.lineItem.text());
if (page) { if (page) {
return page.index - tocLink.pageNumber; return page.index - tocLink.pageNumber;
} }
@ -235,9 +247,9 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION); foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
const headlineType = headlineByLevel(tocLink.level + 2); const headlineType = headlineByLevel(tocLink.level + 2);
const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0); const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
page.items.splice(foundItems.lineIndex + 1, 0, new TextItem({ page.items.splice(foundItems.lineIndex + 1, 0, new LineItem({
...foundItems.headlineItems[0], ...foundItems.headlineItems[0],
text: tocLink.textItem.text, words: tocLink.lineItem.words,
height: headlineHeight, height: headlineHeight,
type: headlineType, type: headlineType,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION
@ -255,21 +267,22 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
} }
} }
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) { function findPageAndLineFromHeadline(pages, tocLink, heightRange, fromPage, toPage) {
const linkText = tocLink.lineItem.text().toUpperCase();
for (var i = fromPage; i <= toPage; i++) { for (var i = fromPage; i <= toPage; i++) {
const page = pages[i - 1]; const page = pages[i - 1];
for ( var line of page.items ) { const lineIndex = page.items.findIndex(line => {
if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) { if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
const match = wordMatch(tocLink.textItem.text, line.text); const match = wordMatch(linkText, line.text());
if (match >= 0.5) { return match >= 0.5;
return line;
}
} }
} return false;
});
if (lineIndex > -1) return [i - 1, lineIndex];
} }
return [-1, -1];
} }
class LinkLeveler { class LinkLeveler {
constructor() { constructor() {
this.levelByMethod = null; this.levelByMethod = null;
@ -297,13 +310,13 @@ class LinkLeveler {
levelByXDiff(tocLinks) { levelByXDiff(tocLinks) {
const uniqueX = this.calculateUniqueX(tocLinks); const uniqueX = this.calculateUniqueX(tocLinks);
tocLinks.forEach(link => { tocLinks.forEach(link => {
link.level = uniqueX.indexOf(link.textItem.x); link.level = uniqueX.indexOf(link.lineItem.x);
}); });
} }
levelByFont(tocLinks) { levelByFont(tocLinks) {
tocLinks.forEach(link => { tocLinks.forEach(link => {
link.level = this.uniqueFonts.indexOf(link.textItem.font); link.level = this.uniqueFonts.indexOf(link.lineItem.font);
}); });
} }
@ -315,7 +328,7 @@ class LinkLeveler {
calculateUniqueX(tocLinks) { calculateUniqueX(tocLinks) {
var uniqueX = tocLinks.reduce(function(uniquesArray, link) { var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x); if (uniquesArray.indexOf(link.lineItem.x) < 0) uniquesArray.push(link.lineItem.x);
return uniquesArray; return uniquesArray;
}, []); }, []);
@ -328,7 +341,7 @@ class LinkLeveler {
calculateUniqueFonts(tocLinks) { calculateUniqueFonts(tocLinks) {
var uniqueFont = tocLinks.reduce(function(uniquesArray, link) { var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font); if (uniquesArray.indexOf(link.lineItem.font) < 0) uniquesArray.push(link.lineItem.font);
return uniquesArray; return uniquesArray;
}, []); }, []);
@ -339,7 +352,7 @@ class LinkLeveler {
class TocLink { class TocLink {
constructor(options) { constructor(options) {
this.textItem = options.textItem; this.lineItem = options.lineItem;
this.pageNumber = options.pageNumber; this.pageNumber = options.pageNumber;
this.level = 0; this.level = 0;
} }

View File

@ -1,4 +1,4 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) {
// Remove elements with similar content on same page positions, like page numbers, licenes information, etc... // Remove elements with similar content on same page positions, like page numbers, licenes information, etc...
export default class RemoveRepetitiveElements extends ToTextItemTransformation { export default class RemoveRepetitiveElements extends ToLineItemTransformation {
constructor() { constructor() {
super("Remove Repetitive Elements"); super("Remove Repetitive Elements");
@ -58,8 +58,8 @@ export default class RemoveRepetitiveElements extends ToTextItemTransformation {
maxElements: [] maxElements: []
}); });
const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), '')); const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), '')); const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
pageStore.push({ pageStore.push({
minElements: minMaxItems.minElements, minElements: minMaxItems.minElements,
maxElements: minMaxItems.maxElements, maxElements: minMaxItems.maxElements,

View File

@ -1,10 +1,11 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx'; import LineItem from '../../LineItem.jsx';
import StashingStream from '../../StashingStream.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
// Converts vertical text to horizontal // Converts vertical text to horizontal
export default class VerticalToHorizontal extends ToTextItemTransformation { export default class VerticalToHorizontal extends ToLineItemTransformation {
constructor() { constructor() {
super("Vertical to Horizontal Text"); super("Vertical to Horizontal Text");
@ -12,87 +13,64 @@ export default class VerticalToHorizontal extends ToTextItemTransformation {
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
var foundVerticals = 0; var foundVerticals = 0;
const newPages = parseResult.pages.map(page => { parseResult.pages.forEach(page => {
const newTextItems = []; const stream = new VerticalsStream();
// var oneCharacterItems = []; stream.consumeAll(page.items);
page.items = stream.complete();
// const applyTransformation = () => { foundVerticals += stream.foundVerticals;
// oneCharacterItems.forEach(item => {
// item.annotation = REMOVED_ANNOTATION;
// newTextItems.push(item);
// //TODO add new
// });
// oneCharacterItems = [];
// };
// const rollbackTransformation = () => {
// oneCharacterItems.forEach(item => {
// newTextItems.push(item);
// });
// oneCharacterItems = [];
// };
//TODO generic state machine code ?
const leftOver = page.items.reduce((oneCharacterItems, item) => {
if (item.text.trim().length == 1) {
if (oneCharacterItems.length == 0) {
oneCharacterItems.push(item);
} else {
const lastItem = oneCharacterItems[oneCharacterItems.length - 1];
if (lastItem.y - item.y > 5 && lastItem.font === item.font) {
oneCharacterItems.push(item);
} else {
if (oneCharacterItems.length > 5) {
var combinedText = '';
var minX = 999;
var maxY = 0;
var sumWidth = 0;
var maxHeight = 0;
oneCharacterItems.forEach(oneCharacterItem => {
oneCharacterItem.annotation = REMOVED_ANNOTATION;
newTextItems.push(oneCharacterItem);
combinedText += oneCharacterItem.text.trim();
minX = Math.min(minX, oneCharacterItem.x);
maxY = Math.max(maxY, oneCharacterItem.y);
sumWidth += oneCharacterItem.width;
maxHeight = Math.max(maxHeight, oneCharacterItem.height);
});
newTextItems.push(new TextItem({
...oneCharacterItems[0],
x: minX,
y: maxY,
width: sumWidth,
height: maxHeight,
text: combinedText,
annotation: ADDED_ANNOTATION
}));
foundVerticals++;
} else {
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
}
oneCharacterItems = [item];
}
}
} else {
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
oneCharacterItems = [];
newTextItems.push(item);
}
return oneCharacterItems;
}, []);
leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
return {
...page,
items: newTextItems
};
}); });
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
pages: newPages,
messages: ["Converted " + foundVerticals + " verticals"] messages: ["Converted " + foundVerticals + " verticals"]
}); });
} }
}
class VerticalsStream extends StashingStream {
constructor() {
super();
this.foundVerticals = 0;
}
shouldStash(item) {
return item.words.length == 1 && item.words[0].string.length == 1;
}
doMatchesStash(lastItem, item) {
return lastItem.y - item.y > 5 && lastItem.words[0].type === item.words[0].type;
}
doFlushStash(stash, results) {
if (stash.length > 5) { // unite
var combinedWords = [];
var minX = 999;
var maxY = 0;
var sumWidth = 0;
var maxHeight = 0;
stash.forEach(oneCharacterLine => {
oneCharacterLine.annotation = REMOVED_ANNOTATION;
results.push(oneCharacterLine);
combinedWords.push(oneCharacterLine.words[0]);
minX = Math.min(minX, oneCharacterLine.x);
maxY = Math.max(maxY, oneCharacterLine.y);
sumWidth += oneCharacterLine.width;
maxHeight = Math.max(maxHeight, oneCharacterLine.height);
});
results.push(new LineItem({
...stash[0],
x: minX,
y: maxY,
width: sumWidth,
height: maxHeight,
words: combinedWords,
annotation: ADDED_ANNOTATION
}));
this.foundVerticals++;
} else { //add as singles
results.push(...stash);
}
}
} }

View File

@ -1,11 +1,11 @@
import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx'; import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { minXFromBlocks } from '../../../textItemFunctions.jsx'; import { minXFromBlocks } from '../../../pageItemFunctions.jsx';
//Detect items which are code/quote blocks //Detect items which are code/quote blocks
export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation { export default class DetectCodeQuoteBlocks extends ToLineItemBlockTransformation {
constructor() { constructor() {
super("Detect Code/Quote Blocks"); super("Detect Code/Quote Blocks");
@ -17,7 +17,7 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
var minX = minXFromBlocks(page.items); var minX = minXFromBlocks(page.items);
page.items.forEach(block => { page.items.forEach(block => {
if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) { if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) {
block.annotation = DETECTED_ANNOTATION; block.annotation = DETECTED_ANNOTATION;
block.type = ElementType.CODE; block.type = ElementType.CODE;
foundCodeItems++; foundCodeItems++;
@ -36,14 +36,14 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
} }
function looksLikeCodeBlock(minX, textItems, mostUsedHeight) { function looksLikeCodeBlock(minX, items, mostUsedHeight) {
if (textItems.length == 0) { if (items.length == 0) {
return false; return false;
} }
if (textItems.length == 1) { if (items.length == 1) {
return textItems[0].x > minX && textItems[0].height <= mostUsedHeight + 1; return items[0].x > minX && items[0].height <= mostUsedHeight + 1;
} }
for ( var item of textItems ) { for ( var item of items ) {
if (item.x == minX) { if (item.x == minX) {
return false; return false;
} }

View File

@ -1,10 +1,11 @@
import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx'; import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import Word from '../../Word.jsx';
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx'; import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
// Cares for proper sub-item spacing/leveling // Cares for proper sub-item spacing/leveling
export default class DetectListLevels extends ToTextItemBlockTransformation { export default class DetectListLevels extends ToLineItemBlockTransformation {
constructor() { constructor() {
super("Level Lists"); super("Level Lists");
@ -21,23 +22,25 @@ export default class DetectListLevels extends ToTextItemBlockTransformation {
var currentLevel = 0; var currentLevel = 0;
const xByLevel = {}; const xByLevel = {};
var modifiedBlock = false; var modifiedBlock = false;
listBlock.textItems.forEach(textItem => { listBlock.items.forEach(item => {
const isListItem = true; const isListItem = true;
if (lastItemX && isListItem) { if (lastItemX && isListItem) {
if (textItem.x > lastItemX) { if (item.x > lastItemX) {
currentLevel++; currentLevel++;
xByLevel[textItem.x] = currentLevel; xByLevel[item.x] = currentLevel;
} else if (textItem.x < lastItemX) { } else if (item.x < lastItemX) {
currentLevel = xByLevel[textItem.x]; currentLevel = xByLevel[item.x];
} }
} else { } else {
xByLevel[textItem.x] = 0; xByLevel[item.x] = 0;
} }
if (currentLevel > 0) { if (currentLevel > 0) {
textItem.text = ' '.repeat(currentLevel * 3) + textItem.text; item.words = [new Word({
string: ' '.repeat(currentLevel * 3)
})].concat(item.words);
modifiedBlock = true; modifiedBlock = true;
} }
lastItemX = textItem.x; lastItemX = item.x;
}); });
listBlocks++; listBlocks++;
if (modifiedBlock) { if (modifiedBlock) {

View File

@ -1,11 +1,11 @@
import ToTextItemBlockTransformation from '../ToTextItemBlockTransformation.jsx'; import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import TextItemBlock from '../../TextItemBlock.jsx'; import LineItemBlock from '../../LineItemBlock.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import { minXFromTextItems } from '../../../textItemFunctions.jsx'; import { minXFromPageItems } from '../../../pageItemFunctions.jsx';
// Gathers lines to blocks // Gathers lines to blocks
export default class GatherBlocks extends ToTextItemBlockTransformation { export default class GatherBlocks extends ToLineItemBlockTransformation {
constructor() { constructor() {
super("Gather Blocks"); super("Gather Blocks");
@ -14,29 +14,29 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals; const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0; var createdBlocks = 0;
var textItems = 0; var lineItemCount = 0;
parseResult.pages.map(page => { parseResult.pages.map(page => {
textItems += page.items.length; lineItemCount += page.items.length;
const blocks = []; const blocks = [];
var stashedBlock = new TextItemBlock({}); var stashedBlock = new LineItemBlock({});
const flushStashedItems = () => { const flushStashedItems = () => {
if (stashedBlock.textItems.length > 1) { if (stashedBlock.items.length > 1) {
stashedBlock.annotation = DETECTED_ANNOTATION; stashedBlock.annotation = DETECTED_ANNOTATION;
} }
blocks.push(stashedBlock); blocks.push(stashedBlock);
stashedBlock = new TextItemBlock({}); stashedBlock = new LineItemBlock({});
createdBlocks++; createdBlocks++;
}; };
var minX = minXFromTextItems(page.items); var minX = minXFromPageItems(page.items);
page.items.forEach(item => { page.items.forEach(item => {
if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) { if (stashedBlock.items.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
flushStashedItems(); flushStashedItems();
} }
stashedBlock.addTextItem(item); stashedBlock.addItem(item);
}); });
if (stashedBlock.textItems.length > 0) { if (stashedBlock.items.length > 0) {
flushStashedItems(); flushStashedItems();
} }
page.items = blocks; page.items = blocks;
@ -44,7 +44,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items'] messages: ['Gathered ' + createdBlocks + ' blocks out of ' + lineItemCount + ' line items']
}); });
} }
@ -54,7 +54,7 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) { if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
return false; return false;
} }
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1]; const lastItem = stashedBlock.items[stashedBlock.items.length - 1];
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance); const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) { if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
return false; return false;

View File

@ -1,10 +1,10 @@
import TextItemBlock from './models/TextItemBlock.jsx'; import PageItem from './models/PageItem.jsx';
import TextItem from './models/TextItem.jsx'; import LineItemBlock from './models/LineItemBlock.jsx';
export function minXFromBlocks(blocks:TextItemBlock[]) { export function minXFromBlocks(blocks:LineItemBlock[]) {
var minX = 999; var minX = 999;
blocks.forEach(block => { blocks.forEach(block => {
block.textItems.forEach(item => { block.items.forEach(item => {
minX = Math.min(minX, item.x) minX = Math.min(minX, item.x)
}); });
}); });
@ -14,7 +14,7 @@ export function minXFromBlocks(blocks:TextItemBlock[]) {
return minX; return minX;
} }
export function minXFromTextItems(items:TextItem) { export function minXFromPageItems(items:PageItem) {
var minX = 999; var minX = 999;
items.forEach(item => { items.forEach(item => {
minX = Math.min(minX, item.x) minX = Math.min(minX, item.x)
@ -25,13 +25,13 @@ export function minXFromTextItems(items:TextItem) {
return minX; return minX;
} }
export function sortByX(items:TextItem) { export function sortByX(items:PageItem) {
items.sort((a, b) => { items.sort((a, b) => {
return a.x - b.x; return a.x - b.x;
}); });
} }
export function sortCopyByX(items:TextItem) { export function sortCopyByX(items:PageItem) {
const copy = items.concat(); const copy = items.concat();
sortByX(copy); sortByX(copy);
return copy; return copy;

View File

@ -1,31 +1,30 @@
import { expect } from 'chai'; import { expect } from 'chai';
import HeadlineFinder from '../src/javascript/models/HeadlineFinder'; import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
import TextItem from '../src/javascript/models/TextItem.jsx'; import LineItem from '../src/javascript/models/LineItem.jsx';
describe('HeadlineFinder', () => { describe('HeadlineFinder', () => {
it('Not Found - Case 1', () => { it('Not Found - Case 1', () => {
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline' headline: 'My Little Headline'
}); });
const item1 = new TextItem({ const item1 = new LineItem({
text: 'My ' text: 'My '
}); });
const item2 = new TextItem({ const item2 = new LineItem({
text: 'Little' text: 'Little'
}); });
const item3 = new TextItem({ const item3 = new LineItem({
text: ' Headline2' text: ' Headline2'
}); });
expect(headlineFinder.consume(item1)).to.equal(null); expect(headlineFinder.consume(item1)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null); expect(headlineFinder.consume(item2)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.equal(null); expect(headlineFinder.consume(item3)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(0); expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
}); });
@ -33,22 +32,22 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline' headline: 'My Little Headline'
}); });
const item1 = new TextItem({ const item1 = new LineItem({
text: 'My ' text: 'My '
}); });
const item2 = new TextItem({ const item2 = new LineItem({
text: 'Little' text: 'Little'
}); });
const item3 = new TextItem({ const item3 = new LineItem({
text: ' Headline' text: ' Headline'
}); });
expect(headlineFinder.consume(item1)).to.equal(null); expect(headlineFinder.consume(item1)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null); expect(headlineFinder.consume(item2)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
}); });
@ -56,27 +55,27 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline' headline: 'My Little Headline'
}); });
const item0 = new TextItem({ const item0 = new LineItem({
text: 'Waste ' text: 'Waste '
}); });
const item1 = new TextItem({ const item1 = new LineItem({
text: 'My ' text: 'My '
}); });
const item2 = new TextItem({ const item2 = new LineItem({
text: 'Little' text: 'Little'
}); });
const item3 = new TextItem({ const item3 = new LineItem({
text: ' Headline' text: ' Headline'
}); });
expect(headlineFinder.consume(item0)).to.equal(null); expect(headlineFinder.consume(item0)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(0); expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
expect(headlineFinder.consume(item1)).to.equal(null); expect(headlineFinder.consume(item1)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null); expect(headlineFinder.consume(item2)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
}); });
@ -84,27 +83,27 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline' headline: 'My Little Headline'
}); });
const item0 = new TextItem({ const item0 = new LineItem({
text: 'My ' text: 'My '
}); });
const item1 = new TextItem({ const item1 = new LineItem({
text: 'My ' text: 'My '
}); });
const item2 = new TextItem({ const item2 = new LineItem({
text: 'Little' text: 'Little'
}); });
const item3 = new TextItem({ const item3 = new LineItem({
text: ' Headline' text: ' Headline'
}); });
expect(headlineFinder.consume(item0)).to.equal(null); expect(headlineFinder.consume(item0)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0); expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item0);
expect(headlineFinder.consume(item1)).to.equal(null); expect(headlineFinder.consume(item1)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null); expect(headlineFinder.consume(item2)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
}); });
@ -112,22 +111,22 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: 'MYLitt le HEADline' headline: 'MYLitt le HEADline'
}); });
const item1 = new TextItem({ const item1 = new LineItem({
text: 'My ' text: 'My '
}); });
const item2 = new TextItem({ const item2 = new LineItem({
text: 'Little' text: 'Little'
}); });
const item3 = new TextItem({ const item3 = new LineItem({
text: ' Headline' text: ' Headline'
}); });
expect(headlineFinder.consume(item1)).to.equal(null); expect(headlineFinder.consume(item1)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null); expect(headlineFinder.consume(item2)).to.equal(null);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2); expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
}); });

View File

@ -2,9 +2,10 @@ import { expect } from 'chai';
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx' import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
describe('hasUpperCaseCharacterInMiddleOfWord', () => { describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => {
it('single word', () => { it('single word', () => {
expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false); expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false);
expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false); expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false);
@ -38,7 +39,7 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
}); });
}); });
describe('removeLeadingWhitespaces', () => { describe('functions: removeLeadingWhitespaces', () => {
it('No Removes', () => { it('No Removes', () => {
expect(removeLeadingWhitespaces(".")).to.be.equal("."); expect(removeLeadingWhitespaces(".")).to.be.equal(".");
expect(removeLeadingWhitespaces(". ")).to.be.equal(". "); expect(removeLeadingWhitespaces(". ")).to.be.equal(". ");
@ -54,7 +55,7 @@ describe('removeLeadingWhitespaces', () => {
}); });
describe('removeTrailingWhitespaces', () => { describe('functions: removeTrailingWhitespaces', () => {
it('No Removes', () => { it('No Removes', () => {
expect(removeTrailingWhitespaces(".")).to.be.equal("."); expect(removeTrailingWhitespaces(".")).to.be.equal(".");
expect(removeTrailingWhitespaces(" .")).to.be.equal(" ."); expect(removeTrailingWhitespaces(" .")).to.be.equal(" .");
@ -71,7 +72,7 @@ describe('removeTrailingWhitespaces', () => {
}); });
describe('prefixAfterWhitespace', () => { describe('functions: prefixAfterWhitespace', () => {
it('Basic', () => { it('Basic', () => {
expect(prefixAfterWhitespace('1', '2')).to.be.equal('12'); expect(prefixAfterWhitespace('1', '2')).to.be.equal('12');
expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12'); expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12');
@ -81,7 +82,7 @@ describe('prefixAfterWhitespace', () => {
}); });
}); });
describe('suffixBeforeWhitespace', () => { describe('functions: suffixBeforeWhitespace', () => {
it('Basic', () => { it('Basic', () => {
expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. '); expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.'); expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.');
@ -92,7 +93,7 @@ describe('suffixBeforeWhitespace', () => {
}); });
describe('charCodeArray', () => { describe('functions: charCodeArray', () => {
it('Charcodes', () => { it('Charcodes', () => {
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46); expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
}); });
@ -105,7 +106,7 @@ describe('charCodeArray', () => {
}); });
describe('normalizedCharCodeArray', () => { describe('functions: normalizedCharCodeArray', () => {
it('No Change', () => { it('No Change', () => {
expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD"); expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
@ -131,7 +132,7 @@ describe('normalizedCharCodeArray', () => {
}); });
describe('isListItem', () => { describe('functions: isListItem', () => {
it('Match', () => { it('Match', () => {
expect(isListItem('- my text')).to.equal(true); expect(isListItem('- my text')).to.equal(true);
@ -154,7 +155,7 @@ describe('isListItem', () => {
}); });
describe('isNumberedListItem', () => { describe('functions: isNumberedListItem', () => {
it('Match', () => { it('Match', () => {
expect(isNumberedListItem('1. my text')).to.equal(true); expect(isNumberedListItem('1. my text')).to.equal(true);
@ -173,7 +174,7 @@ describe('isNumberedListItem', () => {
}); });
describe('wordsMatch', () => { describe('functions: wordsMatch', () => {
it('Match', () => { it('Match', () => {
expect(wordMatch('text 1', 'text 1')).to.equal(1.0); expect(wordMatch('text 1', 'text 1')).to.equal(1.0);

View File

@ -0,0 +1,64 @@
import { expect } from 'chai';
import StashingStream from '../../src/javascript/models/StashingStream';
import TextItem from '../../src/javascript/models/TextItem.jsx';
// Specs for StashingStream, exercised through the MyStashingStream subclass of this file:
// it stashes 'a', 'z' and 'm', upper-cases flushed items and drops every 'm'.
describe('StashingStream', () => {

    it('Simple', () => {
        const stream = new MyStashingStream();
        // Feed the items one by one.
        ['a', 'b', 'a', 'a', 'z', 'm', 'm', 'z', 'z', 'c', 'e', 'f', 'm', 'a']
            .forEach(item => stream.consume(item));

        const output = stream.complete().join('');

        expect(output).to.equal('AbAAZZZcefA');
        // Dropped 'm' items still count as transformed.
        expect(stream.transformedItems).to.equal(10);
    });

    it('ConsumeAll', () => {
        const stream = new MyStashingStream();
        // Feed the whole list in a single call.
        stream.consumeAll(['k', 'k', 'x', 'a', 'm', 'z', 'o', 'p']);

        const output = stream.complete().join('');

        expect(output).to.equal('kkxAZop');
        expect(stream.transformedItems).to.equal(3);
    });

});
// Test double for StashingStream: stashes the letters 'a', 'z' and 'm',
// groups equal neighbours and, on flush, emits them upper-cased without the 'm' items.
class MyStashingStream extends StashingStream {

    constructor() {
        super();
        // Total number of items that went through doFlushStash (including dropped ones).
        this.transformedItems = 0;
    }

    // Only these three letters are stashed.
    shouldStash(item) {
        switch (item) {
            case 'a':
            case 'z':
            case 'm':
                return true;
            default:
                return false;
        }
    }

    // An item joins the current stash only if it is identical to the last stashed item.
    doMatchesStash(lastItem, item) {
        return item === lastItem;
    }

    // Counts every stashed item, then emits the non-'m' ones in upper case.
    doFlushStash(stash, results) {
        this.transformedItems = this.transformedItems + stash.length;
        const kept = stash.filter(elem => elem !== 'm');
        const upperCased = kept.map(elem => elem.toUpperCase());
        results.push(...upperCased);
    }

}