diff --git a/package.json b/package.json
index 7d87c49..ba460bb 100644
--- a/package.json
+++ b/package.json
@@ -7,7 +7,7 @@
"watch": "webpack -d --watch",
"build": "webpack",
"lint": "eslint src --ext .js --ext .jsx --cache",
- "test": "mocha --compilers js:babel-core/register test/*.spec.js",
+ "test": "mocha --compilers js:babel-core/register test --recursive",
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
"deploy": "npm run release && cp -r build/* docs/"
},
diff --git a/src/javascript/components/debug/TextItemBlockPageView.jsx b/src/javascript/components/debug/LineItemBlockPageView.jsx
similarity index 84%
rename from src/javascript/components/debug/TextItemBlockPageView.jsx
rename to src/javascript/components/debug/LineItemBlockPageView.jsx
index 261c16f..bd51ed7 100644
--- a/src/javascript/components/debug/TextItemBlockPageView.jsx
+++ b/src/javascript/components/debug/LineItemBlockPageView.jsx
@@ -1,13 +1,12 @@
import React from 'react';
import PageView from './PageView.jsx';
-import TextItemTable from './TextItemTable.jsx';
+import LineItemTable from './LineItemTable.jsx';
-// View for a Page which items are of kind TextItemBlock
-export default class TextItemBlockPageView extends PageView {
+// View for a Page which items are of kind LineItemBlock
+export default class LineItemBlockPageView extends PageView {
createItemViews(items, showWhitespaces) {
const blockTables = items.map((block, i) => {
- var textItems = block.textItems;
const blockType = block.type ? ' - ' + block.type.name : null;
const blockAnnotation = block.annotation ? { ' - ' + block.annotation.category }
: null;
@@ -38,7 +37,7 @@ export default class TextItemBlockPageView extends PageView {
Block { i + 1 }{ blockType } { blockAnnotation }
-
+
{ footnoteLinks }
{ footnotes }
diff --git a/src/javascript/components/debug/LineItemPageView.jsx b/src/javascript/components/debug/LineItemPageView.jsx
new file mode 100644
index 0000000..a54abae
--- /dev/null
+++ b/src/javascript/components/debug/LineItemPageView.jsx
@@ -0,0 +1,12 @@
+import React from 'react';
+import PageView from './PageView.jsx';
+import LineItemTable from './LineItemTable.jsx';
+
+// View for a Page which items are of kind LineItem
+export default class LineItemPageView extends PageView {
+
+ createItemViews(items, showWhitespaces) {
+ return
+ }
+
+}
\ No newline at end of file
diff --git a/src/javascript/components/debug/LineItemTable.jsx b/src/javascript/components/debug/LineItemTable.jsx
new file mode 100644
index 0000000..27eb0eb
--- /dev/null
+++ b/src/javascript/components/debug/LineItemTable.jsx
@@ -0,0 +1,108 @@
+import React from 'react';
+
+import Table from 'react-bootstrap/lib/Table'
+
+// Displays an array of LineItem as a table
+export default class LineItemTable extends React.Component {
+
+ static propTypes = {
+ items: React.PropTypes.array.isRequired,
+ showWhitespaces: React.PropTypes.bool
+ };
+
+ render() {
+ const {showWhitespaces, items} = this.props;
+ const tableHeader =
+
+
+ #
+ |
+
+ Text
+ |
+
+ X
+ |
+
+ Y
+ |
+
+ Width
+ |
+
+ Height
+ |
+
+
+
+ const itemRows = items.map((item, i) =>
+
+
+ { i }
+
+
+ { item.annotation ? item.annotation.category : '' }
+
+
+ { item.type ? item.type.name : '' }
+
+
+ { item.parsedElements && item.parsedElements.footnoteLinks.length > 0 ?
+ Footnote-Link
+ : '' }
+ { item.parsedElements && item.parsedElements.containLinks ?
+ Link
+ : '' }
+ { item.lineFormat ?
+ { item.lineFormat.name }
+ : '' }
+ { item.unopenedFormat ?
+ Unopened
+ { ' ' + item.unopenedFormat.name }
+ : '' }
+ { item.parsedElements && item.parsedElements.inlineFormats > 0 ?
+ { item.parsedElements.inlineFormats + 'x Bold/Italic' }
+ : '' }
+ { item.unclosedFormat ?
+ Unclosed
+ { ' ' + item.unclosedFormat.name }
+ : '' }
+
+ |
+
+ { showWhitespaces ? (
+ { item.text() }
+ ) : (item.text()) }
+ |
+
+ { item.x }
+ |
+
+ { item.y }
+ |
+
+ { item.width }
+ |
+
+ { item.height }
+ |
+
+ )
+
+ return (
+
+ { tableHeader }
+
+ { itemRows }
+
+
+ );
+ }
+}
\ No newline at end of file
diff --git a/src/javascript/functions.jsx b/src/javascript/functions.jsx
index 53b6a09..1680a69 100644
--- a/src/javascript/functions.jsx
+++ b/src/javascript/functions.jsx
@@ -18,6 +18,17 @@ export function isNumber(string) {
return true;
}
+// Returns true if every UTF-16 code unit of `string` equals the first code unit of `char` (also true for an empty `string`).
+export function hasOnly(string, char) {
+    const expectedCode = char.charCodeAt(0);
+    for (let index = 0; index < string.length; index++) {
+        if (string.charCodeAt(index) !== expectedCode) {
+            return false;
+        }
+    }
+    return true;
+}
+
export function hasUpperCaseCharacterInMiddleOfWord(text) {
var beginningOfWord = true;
for (var i = 0; i < text.length; i++) {
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index 7ecdb4d..14e7be4 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -1,6 +1,7 @@
import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
+
import CompactLines from './transformations/textitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
@@ -56,10 +57,10 @@ export default class AppState {
new CompactLines(),
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
- new PostprocessLines(),
+ // new PostprocessLines(),
new DetectTOC(),
new DetectHeaders(),
- new CompleteFormats(),
+ // new CompleteFormats(),
new DetectListItems(),
new GatherBlocks(),
diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx
index 3817708..6463a04 100644
--- a/src/javascript/models/ElementType.jsx
+++ b/src/javascript/models/ElementType.jsx
@@ -1,83 +1,85 @@
import { Enum } from 'enumify';
-import TextItem from './TextItem.jsx';
-import TextItemBlock from './TextItemBlock.jsx';
+import LineItem from './LineItem.jsx';
+import LineItemBlock from './LineItemBlock.jsx';
// An Markdown element
export default class ElementType extends Enum {
}
+//TODO rename to BlockType
+
ElementType.initEnum({
H1: {
headline: true,
headlineLevel: 1,
- toText(block:TextItemBlock) {
- return '# ' + concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return '# ' + concatLineItems(block.items);
}
},
H2: {
headline: true,
headlineLevel: 2,
- toText(block:TextItemBlock) {
- return '## ' + concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return '## ' + concatLineItems(block.items);
}
},
H3: {
headline: true,
headlineLevel: 3,
- toText(block:TextItemBlock) {
- return '### ' + concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return '### ' + concatLineItems(block.items);
}
},
H4: {
headline: true,
headlineLevel: 4,
- toText(block:TextItemBlock) {
- return '#### ' + concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return '#### ' + concatLineItems(block.items);
}
},
H5: {
headline: true,
headlineLevel: 5,
- toText(block:TextItemBlock) {
- return '##### ' + concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return '##### ' + concatLineItems(block.items);
}
},
H6: {
headline: true,
headlineLevel: 6,
- toText(block:TextItemBlock) {
- return '###### ' + concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return '###### ' + concatLineItems(block.items);
}
},
TOC: {
mergeToBlock: true,
- toText(block:TextItemBlock) {
- return concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return concatLineItems(block.items);
}
},
FOOTNOTES: {
mergeToBlock: true,
mergeFollowingNonTypedItems: true,
- toText(block:TextItemBlock) {
- return concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return concatLineItems(block.items);
}
},
CODE: {
mergeToBlock: true,
- toText(block:TextItemBlock) {
- return '```\n' + concatTextItems(block.textItems) + '```'
+ toText(block:LineItemBlock) {
+ return '```\n' + concatLineItems(block.items) + '```'
}
},
LIST: {
mergeToBlock: true,
mergeFollowingNonTypedItemsWithSmallDistance: true,
- toText(block:TextItemBlock) {
- return concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return concatLineItems(block.items);
}
},
PARAGRAPH: {
- toText(block:TextItemBlock) {
- return concatTextItems(block.textItems);
+ toText(block:LineItemBlock) {
+ return concatLineItems(block.items);
}
}
});
@@ -86,17 +88,17 @@ export function isHeadline(elementType: ElementType) {
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
}
-export function blockToText(block: TextItemBlock) {
+export function blockToText(block: LineItemBlock) {
if (!block.type) {
- return concatTextItems(block.textItems);
+ return concatLineItems(block.items);
}
return block.type.toText(block);
}
-function concatTextItems(textItems: TextItem[]) {
+function concatLineItems(lineItems: LineItem[]) {
var text = '';
- textItems.forEach(item => {
- text += item.text + '\n';
+ lineItems.forEach(item => {
+ text += item.text() + '\n';
});
return text;
}
diff --git a/src/javascript/models/HeadlineFinder.jsx b/src/javascript/models/HeadlineFinder.jsx
index 4e7fe62..4061400 100644
--- a/src/javascript/models/HeadlineFinder.jsx
+++ b/src/javascript/models/HeadlineFinder.jsx
@@ -4,24 +4,25 @@ export default class HeadlineFinder {
constructor(options) {
this.headlineCharCodes = normalizedCharCodeArray(options.headline);
- this.stackedTextItems = [];
+ this.stackedLineItems = [];
this.stackedChars = 0;
}
- consume(textItem) {
- const normalizedCharCodes = normalizedCharCodeArray(textItem.text);
+ consume(lineItem) {
+ //TODO avoid join
+ const normalizedCharCodes = normalizedCharCodeArray(lineItem.text());
const matchAll = this.matchAll(normalizedCharCodes);
if (matchAll) {
- this.stackedTextItems.push(textItem);
+ this.stackedLineItems.push(lineItem);
this.stackedChars += normalizedCharCodes.length;
if (this.stackedChars == this.headlineCharCodes.length) {
- return this.stackedTextItems;
+ return this.stackedLineItems;
}
} else {
if (this.stackedChars > 0) {
this.stackedChars = 0;
- this.stackedTextItems = [];
- this.consume(textItem); // test again without stack
+ this.stackedLineItems = [];
+ this.consume(lineItem); // test again without stack
}
}
return null;
diff --git a/src/javascript/models/LineConverter.jsx b/src/javascript/models/LineConverter.jsx
new file mode 100644
index 0000000..5ea47ee
--- /dev/null
+++ b/src/javascript/models/LineConverter.jsx
@@ -0,0 +1,145 @@
+import TextItem from './TextItem.jsx';
+import Word from './Word.jsx';
+import WordType from './markdown/WordType.jsx';
+import LineItem from './LineItem.jsx';
+import StashingStream from './StashingStream.jsx';
+import { ParsedElements } from './PageItem.jsx';
+import { isNumber } from '../functions.jsx'
+import { sortByX } from '../pageItemFunctions.jsx'
+
+// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem, doing inline transformations like
+// 'whitespace removal', bold/emphasis annotation, link-detection, etc.
+export default class LineConverter {
+
+    // fontToFormats is queried with .get(font) by the word detection to find the format of an item's font
+    constructor(fontToFormats) {
+        this.fontToFormats = fontToFormats;
+    }
+
+    // Returns one LineItem for the given text items of a line: positioned at the first item (after sorting),
+    // as high as the highest item, as wide as the sum of all item widths, holding the detected words.
+    compact(textItems: TextItem[]) {
+        // we can't trust order of occurrence, esp. footnoteLinks like to come last
+        // (the return value of sortByX is ignored, so it presumably sorts the passed array in place)
+        sortByX(textItems);
+
+        const detector = new WordDetectionStream(this.fontToFormats);
+        const itemCopies = textItems.map(item => new TextItem({
+            ...item
+        }));
+        detector.consumeAll(itemCopies);
+        const detectedWords = detector.complete();
+
+        const lineHeight = textItems.reduce((highest, item) => Math.max(highest, item.height), 0);
+        const lineWidth = textItems.reduce((sum, item) => sum + item.width, 0);
+        const firstItem = textItems[0];
+        return new LineItem({
+            x: firstItem.x,
+            y: firstItem.y,
+            height: lineHeight,
+            width: lineWidth,
+            words: detectedWords,
+            parsedElements: new ParsedElements({
+                footnoteLinks: detector.footnoteLinks,
+                footnotes: detector.footnotes
+            })
+        });
+    }
+
+}
+
+// Combines the text of the items, splits it at spaces and turns every non-blank part
+// into a Word whose type is the given format.
+function itemsToWords(items, format) {
+    const combinedText = combineText(items);
+    const parts = combinedText.split(' ');
+    const nonBlankParts = parts.filter(part => part.trim().length > 0);
+    return nonBlankParts.map(part => new Word({
+        string: part,
+        type: format
+    }));
+}
+
+// Concatenates the text of the given items in array order. A single space is inserted between two
+// items when neither side already has one and the horizontal gap between them is larger than 5.
+function combineText(textItems) {
+    let combined = '';
+    for (let index = 0; index < textItems.length; index++) {
+        const current = textItems[index];
+        const previous = textItems[index - 1];
+        const spaceMissing = previous && !combined.endsWith(' ') && !current.text.startsWith(' ');
+        if (spaceMissing && current.x - previous.x - previous.width > 5) {
+            combined += ' ';
+        }
+        combined += current.text;
+    }
+    return combined;
+}
+
+class WordDetectionStream extends StashingStream {
+    // Stashes consecutive text items that agree on format and on being a number, and flushes each run as Word(s).
+    constructor(fontToFormats) {
+        super();
+        this.fontToFormats = fontToFormats;
+        this.footnoteLinks = [];
+        this.footnotes = [];
+
+        this.firstY = undefined; // y of the first consumed item, the reference for footnote-link detection
+        this.stashedNumber = false; // whether the most recently stashed item's trimmed text is a number
+        this.currentItem = undefined; // the item most recently passed to shouldStash()
+    }
+    // Every item takes part in the stash; remembers the first y seen and the current item for doFlushStash().
+    shouldStash(item) {
+        if (this.firstY === undefined) { // explicit check, so that a first y of 0 is kept as the reference
+            this.firstY = item.y;
+        }
+        this.currentItem = item;
+        return true;
+    }
+    // Tracks whether the stashed items are numbers (their trimmed text passes isNumber).
+    onPushOnStash(item) {
+        this.stashedNumber = isNumber(item.text.trim());
+    }
+    // An item continues the stash if its font maps to the same format and it agrees on being a number.
+    doMatchesStash(lastItem, item) {
+        const lastItemFormat = this.fontToFormats.get(lastItem.font);
+        const itemFormat = this.fontToFormats.get(item.font);
+        if (lastItemFormat !== itemFormat) {
+            return false;
+        }
+        const itemIsANumber = isNumber(item.text.trim());
+        return this.stashedNumber == itemIsANumber;
+    }
+    // Numbers with a greater y than the first item become a FOOTNOTE_LINK word, numbers with a greater y than the current item a FOOTNOTE word; all else plain words.
+    doFlushStash(stash, results) {
+        if (this.stashedNumber) {
+            const joinedNumber = stash.map(item => item.text).join('');
+            if (stash[0].y > this.firstY) { // footnote link
+                results.push(new Word({
+                    string: joinedNumber,
+                    type: WordType.FOOTNOTE_LINK
+                //TODO format to
+                //^
+                //`[${joinedNumber}](#${joinedNumber})`
+                }));
+                this.footnoteLinks.push(parseInt(joinedNumber, 10));
+            } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote
+                results.push(new Word({
+                    string: joinedNumber,
+                    type: WordType.FOOTNOTE
+                //TODO format to (^${ joinedNumber}):
+                }));
+                this.footnotes.push(joinedNumber); // NOTE(review): stored as string while footnoteLinks holds ints — confirm intended
+            } else {
+                this.copyStashItemsAsText(stash, results);
+            }
+        } else {
+            this.copyStashItemsAsText(stash, results);
+        }
+    }
+    // Converts the stashed items to words typed with the format of the first item's font.
+    copyStashItemsAsText(stash, results) {
+        const format = this.fontToFormats.get(stash[0].font);
+        results.push(...itemsToWords(stash, format));
+    }
+}
diff --git a/src/javascript/models/LineItem.jsx b/src/javascript/models/LineItem.jsx
new file mode 100644
index 0000000..a52afcf
--- /dev/null
+++ b/src/javascript/models/LineItem.jsx
@@ -0,0 +1,29 @@
+import PageItem from './PageItem.jsx'
+import Word from './Word.jsx'
+
+//A line within a page
+export default class LineItem extends PageItem {
+    // options: x, y, width, height and either words (Word[]) or text (split at spaces into Words when no words are given)
+    constructor(options) {
+        super(options);
+        this.x = options.x;
+        this.y = options.y;
+        this.width = options.width;
+        this.height = options.height;
+        if (options.words) {
+            this.words = options.words;
+        } else if (options.text) {
+            this.words = options.text.split(" ").filter(part => part.trim().length > 0).map(part => new Word({ string: part }));
+        } else {
+            this.words = [];
+        }
+    }
+    // The strings of all words joined by single spaces.
+    text() {
+        return this.wordStrings().join(" ");
+    }
+    // The string of every word, in word order.
+    wordStrings() {
+        return this.words.map(({ string }) => string);
+    }
+}
diff --git a/src/javascript/models/LineItemBlock.jsx b/src/javascript/models/LineItemBlock.jsx
new file mode 100644
index 0000000..64f9394
--- /dev/null
+++ b/src/javascript/models/LineItemBlock.jsx
@@ -0,0 +1,36 @@
+import PageItem from './PageItem.jsx'
+import LineItem from './LineItem.jsx'
+
+// A block of LineItem[] within a Page
+export default class LineItemBlock extends PageItem {
+    // options.items (optional): LineItems which are added one by one through addItem()
+    constructor(options) {
+        super(options);
+        this.items = [];
+        if (options.items) {
+            options.items.forEach(item => this.addItem(item));
+        }
+    }
+    // Adds a copy of the item (with its type cleared); a block without a type takes over the type of the added item.
+    addItem(item:LineItem) {
+        if (this.type && item.type && this.type !== item.type) {
+            // throw a real Error (not a bare string) so the failure carries a stack trace
+            throw new Error(`Adding item of type ${item.type} to block of type ${this.type}`);
+        }
+        if (!this.type) {
+            this.type = item.type;
+        }
+        if (item.parsedElements) {
+            if (this.parsedElements) {
+                this.parsedElements.add(item.parsedElements);
+            } else {
+                // NOTE(review): the block shares this object with the item, so later add() calls presumably change the item's parsedElements too — confirm
+                this.parsedElements = item.parsedElements;
+            }
+        }
+        const copiedItem = new LineItem({ ...item });
+        copiedItem.type = null;
+        this.items.push(copiedItem);
+    }
+
+}
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
index dbb00af..72f465d 100644
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@@ -1,4 +1,4 @@
+// An abstract PageItem class, can be TextItem, LineItem or LineItemBlock
+// A abstract PageItem class, can be TextItem, LineItem or LineItemBlock
export default class PageItem {
constructor(options) {
diff --git a/src/javascript/models/StashingStream.jsx b/src/javascript/models/StashingStream.jsx
new file mode 100644
index 0000000..630fb8d
--- /dev/null
+++ b/src/javascript/models/StashingStream.jsx
@@ -0,0 +1,73 @@
+//Abstract stream which allows stashing items temporarily
+export default class StashingStream {
+    // Consumed items either go straight to the results or are collected on the stash, which sub-classes flush into the results.
+    constructor() {
+        if (this.constructor === StashingStream) {
+            throw new TypeError("Can not construct abstract class.");
+        }
+        this.results = [];
+        this.stash = [];
+    }
+    // Consumes all items in array order.
+    consumeAll(items) {
+        items.forEach(item => this.consume(item));
+    }
+    // Stashes the item (flushing first if it does not match the stash) or flushes the stash and appends the item to the results.
+    consume(item) {
+        if (this.shouldStash(item)) {
+            if (!this.matchesStash(item)) {
+                this.flushStash();
+            }
+            this.pushOnStash(item);
+        } else {
+            if (this.stash.length > 0) {
+                this.flushStash();
+            }
+            this.results.push(item);
+        }
+    }
+    // Calls the onPushOnStash() hook and then puts the item on the stash.
+    pushOnStash(item) {
+        this.onPushOnStash(item);
+        this.stash.push(item);
+    }
+    // Flushes what is left on the stash and returns the results.
+    complete() {
+        if (this.stash.length > 0) {
+            this.flushStash();
+        }
+        return this.results;
+    }
+
+    // returns true if the stash is empty or the item matches the last item of the stash
+    matchesStash(item) {
+        if (this.stash.length === 0) {
+            return true;
+        }
+        const lastItem = this.stash[this.stash.length - 1];
+        return this.doMatchesStash(lastItem, item);
+    }
+    // Hands a non-empty stash to doFlushStash() and starts a new, empty stash.
+    flushStash() {
+        if (this.stash.length > 0) {
+            this.doFlushStash(this.stash, this.results);
+            this.stash = [];
+        }
+    }
+    // Hook called with every item right before it is put on the stash.
+    onPushOnStash(item) { // eslint-disable-line no-unused-vars
+        //sub-classes may override
+    }
+    // Abstract: returns true if the item takes part in stashing, false if it goes directly to the results.
+    shouldStash(item) { // eslint-disable-line no-unused-vars
+        throw new TypeError("Abstract method shouldStash(item) must be overridden by the sub-class.");
+    }
+    // Abstract: returns true if the item belongs to the same group as the last stashed item.
+    doMatchesStash(lastItem, item) { // eslint-disable-line no-unused-vars
+        throw new TypeError("Abstract method doMatchesStash(lastItem, item) must be overridden by the sub-class.");
+    }
+    // Abstract: converts the stashed items and pushes the outcome to the results.
+    doFlushStash(stash, results) { // eslint-disable-line no-unused-vars
+        throw new TypeError("Abstract method doFlushStash(stash, results) must be overridden by the sub-class.");
+    }
+}
\ No newline at end of file
diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx
index d013cc1..1565568 100644
--- a/src/javascript/models/TextItem.jsx
+++ b/src/javascript/models/TextItem.jsx
@@ -11,8 +11,6 @@ export default class TextItem extends PageItem {
this.height = options.height;
this.text = options.text;
this.font = options.font;
- this.fontAscent = options.fontAscent;
- this.fontDescent = options.fontDescent;
this.lineFormat = options.lineFormat;
this.unopenedFormat = options.unopenedFormat;
diff --git a/src/javascript/models/TextItemBlock.jsx b/src/javascript/models/TextItemBlock.jsx
deleted file mode 100644
index 9e19266..0000000
--- a/src/javascript/models/TextItemBlock.jsx
+++ /dev/null
@@ -1,36 +0,0 @@
-import PageItem from './PageItem.jsx'
-import TextItem from './TextItem.jsx'
-
-// A block of TextItem[] within a Page
-export default class TextItemBlock extends PageItem {
-
- constructor(options) {
- super(options);
- this.textItems = [];
- if (options.textItems) {
- options.textItems.forEach(item => this.addTextItem(item));
- }
- }
-
- addTextItem(textItem:TextItem) {
- if (this.type && textItem.type && this.type !== textItem.type) {
- throw `Adding text item of type ${textItem.type} to block of type ${this.type}`
- }
- if (!this.type) {
- this.type = textItem.type;
- }
- if (textItem.parsedElements) {
- if (this.parsedElements) {
- this.parsedElements.add(textItem.parsedElements);
- } else {
- this.parsedElements = textItem.parsedElements;
- }
- }
- const copiedTextItem = new TextItem({
- ...textItem
- });
- copiedTextItem.type = null;
- this.textItems.push(copiedTextItem);
- }
-
-}
diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx
deleted file mode 100644
index ace1083..0000000
--- a/src/javascript/models/TextItemLineCompactor.jsx
+++ /dev/null
@@ -1,227 +0,0 @@
-import TextItem from './TextItem.jsx';
-import { ParsedElements } from './PageItem.jsx';
-import { isNumber } from '../functions.jsx'
-import { sortByX } from '../textItemFunctions.jsx'
-import { prefixAfterWhitespace, suffixBeforeWhitespace } from '../functions.jsx';
-
-// Compact text items which have been grouped to a line (through TextItemLineCompactor) to a single TextItem doing inline transformations like
-//'whitespace removal', bold/emphasis annotation, link-detection, etc..
-export default class TextItemLineCompactor {
-
- constructor(fontToFormats) {
- this.fontToFormats = fontToFormats;
- }
-
- // returns a CombineResult
- compact(lineItems: TextItem[]) {
- if (lineItems.length < 2) {
- throw "Must be at least 2 line items, but was " + lineItems;
- }
-
- // we can't trust order of occurence, esp. footnoteLinks like to come last
- sortByX(lineItems);
-
- const formatter = new Formatter(this.fontToFormats);
- var [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
- resolvedLineItems.forEach(item => formatter.consume(item));
- resolvedLineItems = formatter.getResults();
- parsedElements.inlineFormats = formatter.inlineFormats;
- // const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
-
- var combinedItem;
- if (resolvedLineItems.length == 1) {
- combinedItem = resolvedLineItems[0];
- } else {
- var text = '';
- var maxHeight = 0;
- var widthSum = 0;
- var lastItem;
- resolvedLineItems.forEach(item => {
- if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
- const xDistance = item.x - lastItem.x - lastItem.width;
- if (xDistance >= 5) {
- text += ' ';
- }
- }
- text += item.text;
- widthSum += item.width;
- lastItem = item;
- maxHeight = Math.max(maxHeight, item.height);
- });
- combinedItem = new TextItem({
- ...resolvedLineItems[0],
- text: text,
- height: maxHeight,
- width: widthSum
- });
- }
- combinedItem.parsedElements = parsedElements;
- combinedItem.lineFormat = formatter.lineFormat;
- combinedItem.unopenedFormat = formatter.unopenedFormat;
- combinedItem.unclosedFormat = formatter.unclosedFormat;
- return combinedItem;
- }
-
-
- resolveSpecialElements(lineItems) {
- const footnoteLinks = [];
- const footnotes = [];
- const basicY = lineItems[0].y;
- const newLineItems = [];
- var stashedNumberItems = [];
-
- const commitStashedNumbers = (nextItem) => {
- if (stashedNumberItems.length > 0) {
- const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
- if (stashedNumberItems[0].y > basicY) { // footnote link
- newLineItems.push(new TextItem({
- ...stashedNumberItems[0],
- //TODO make fomatting configurable
- // text: `[${joinedNumber}](#${joinedNumber})`
- text: `^${joinedNumber}`
- }));
- footnoteLinks.push(parseInt(joinedNumber));
- } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
- //TODO womb comp [29] => ydiff == 0
- newLineItems.push(new TextItem({
- ...stashedNumberItems[0],
- text: `(^${ joinedNumber}): `
- }));
- footnotes.push(joinedNumber);
- } else {
- stashedNumberItems.forEach(number => newLineItems.push(number));
- }
-
- stashedNumberItems = [];
- }
- };
-
- lineItems.forEach(item => {
- if (newLineItems.length == 0 && item.text.trim().length == 0) {
- // skip whitespace on the beginning of a line
- } else {
- const isANumber = isNumber(item.text.trim());
- if (isANumber) {
- stashedNumberItems.push(item);
- } else {
- if (stashedNumberItems.length > 0) {
- commitStashedNumbers(item);
- }
- newLineItems.push(item);
- }
- }
- });
- commitStashedNumbers();
-
-
- return [newLineItems, new ParsedElements({
- footnoteLinks: footnoteLinks,
- footnotes: footnotes
- })];
- }
-
-}
-
-class Formatter {
-
- constructor(fontToFormats) {
- this.fontToFormats = fontToFormats;
-
- this.resultItems = [];
- this.lineFormat;
- this.unopenedFormat;
- this.unclosedFormat;
-
- this.openFormat;
- this.stashedItems = [];
- this.inlineFormats = 0;
- this.lastItem;
- }
-
-
- consume(item) {
- const formatType = this.fontToFormats.get(item.font);
- if (this.openFormat && formatType !== this.openFormat) {
- this.flushStash(false);
- }
- if (formatType.needFormat) {
- this.openFormat = formatType;
- this.stashedItems.push(item);
- } else {
- this.resultItems.push(item);
- }
- }
-
- getResults() {
- if (this.openFormat) {
- this.flushStash(true);
- }
- return this.resultItems;
- }
-
- flushStash(formatToEndOfLine) {
- const formatFromBeginningOfLine = this.resultItems == 0;
- if (formatFromBeginningOfLine) {
- if (formatToEndOfLine) {
- this.lineFormat = this.openFormat;
- this.moveStashItemsToResult();
- } else {
- this.unopenedFormat = this.openFormat;
- const newLastItem = this.newClosingItem(this.stashedItems.pop());
- this.moveStashItemsToResult();
- this.resultItems.push(newLastItem);
- }
- } else {
- if (formatToEndOfLine) {
- this.unclosedFormat = this.openFormat;
- const newFirstItem = this.newOpeningItem(this.stashedItems.shift());
- this.resultItems.push(newFirstItem);
- this.moveStashItemsToResult();
- } else {
- this.inlineFormats++;
- if (this.stashedItems.length == 1) {
- const onlyItem = this.stashedItems.pop();
- if (onlyItem.text.trim().length > 0) {
- const onlyItemFormatted = this.newCompleteItem(onlyItem);
- this.resultItems.push(onlyItemFormatted);
- }
- this.moveStashItemsToResult();
- } else {
- const firstItem = this.newOpeningItem(this.stashedItems.shift());
- const lastItem = this.newClosingItem(this.stashedItems.pop());
- this.resultItems.push(firstItem);
- this.moveStashItemsToResult();
- this.resultItems.push(lastItem);
- }
- }
- }
- }
-
- moveStashItemsToResult() {
- this.resultItems.push(...this.stashedItems);
- this.stashedItems = [];
- this.openFormat = null;
- }
-
- newOpeningItem(item) {
- return new TextItem({
- ...item,
- text: prefixAfterWhitespace(this.openFormat.startSymbol, item.text)
- });
- }
-
- newClosingItem(item) {
- return new TextItem({
- ...item,
- text: suffixBeforeWhitespace(item.text, this.openFormat.endSymbol)
- });
- }
-
- newCompleteItem(item) {
- return new TextItem({
- ...item,
- text: suffixBeforeWhitespace(prefixAfterWhitespace(this.openFormat.startSymbol, item.text), this.openFormat.endSymbol)
- });
- }
-
-}
diff --git a/src/javascript/models/TextItemLineGrouper.jsx b/src/javascript/models/TextItemLineGrouper.jsx
index 54264b5..379191f 100644
--- a/src/javascript/models/TextItemLineGrouper.jsx
+++ b/src/javascript/models/TextItemLineGrouper.jsx
@@ -1,5 +1,5 @@
import TextItem from './TextItem.jsx';
-import { sortByX } from '../textItemFunctions.jsx'
+import { sortByX } from '../pageItemFunctions.jsx'
//Groups all text items which are on the same y line
export default class TextItemLineGrouper {
diff --git a/src/javascript/models/Word.jsx b/src/javascript/models/Word.jsx
new file mode 100644
index 0000000..b559c6c
--- /dev/null
+++ b/src/javascript/models/Word.jsx
@@ -0,0 +1,8 @@
+export default class Word {
+    // options.string: the text of the word; options.type: optional WordType of the word
+    constructor(options) {
+        const { string, type } = options;
+        this.string = string;
+        this.type = type; // WordType
+    }
+}
\ No newline at end of file
diff --git a/src/javascript/models/markdown/WordType.jsx b/src/javascript/models/markdown/WordType.jsx
new file mode 100644
index 0000000..64a531d
--- /dev/null
+++ b/src/javascript/models/markdown/WordType.jsx
@@ -0,0 +1,7 @@
+import { Enum } from 'enumify';
+
+// A Markdown word element type (used as the `type` of a Word)
+export default class WordType extends Enum {
+}
+// The enum instances (WordType.LINK, WordType.FOOTNOTE_LINK, ...) are created from these names by initEnum
+WordType.initEnum(['LINK', 'FOOTNOTE_LINK', 'FOOTNOTE', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']);
\ No newline at end of file
diff --git a/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx b/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx
similarity index 73%
rename from src/javascript/models/transformations/ToTextItemBlockTransformation.jsx
rename to src/javascript/models/transformations/ToLineItemBlockTransformation.jsx
index 43be1ec..92e2524 100644
--- a/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx
+++ b/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx
@@ -1,16 +1,16 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
-import TextItemBlock from '../TextItemBlock.jsx';
-import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx';
+import LineItemBlock from '../LineItemBlock.jsx';
+import LineItemBlockPageView from '../../components/debug/LineItemBlockPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
-// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView
-export default class ToTextItemBlockTransformation extends Transformation {
+// Abstract class for transformations producing LineItemBlock(s) to be shown in the LineItemBlockPageView
+export default class ToLineItemBlockTransformation extends Transformation {
constructor(name) {
- super(name, TextItemBlock.name);
- if (this.constructor === ToTextItemBlockTransformation) {
+ super(name, LineItemBlock.name);
+ if (this.constructor === ToLineItemBlockTransformation) {
throw new TypeError("Can not construct abstract class.");
}
this.showWhitespaces = false;
@@ -25,7 +25,7 @@ export default class ToTextItemBlockTransformation extends Transformation {
}
createPageView(page, modificationsOnly) {
- return ;
+ }
+
+ completeTransform(parseResult:ParseResult) {
+ // The usual cleanup
+ parseResult.messages = [];
+ parseResult.pages.forEach(page => {
+ page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
+ page.items.forEach(item => item.annotation = null);
+ });
+ return parseResult;
+ }
+
+
+}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
index e28d19f..7a06d00 100644
--- a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
+++ b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
@@ -1,6 +1,7 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
-import StringFormat from '../../StringFormat.jsx';
+import WordType from '../../markdown/WordType.jsx';
+// import StringFormat from '../../StringFormat.jsx';
export default class CalculateGlobalStats extends ToTextItemTransformation {
@@ -54,21 +55,21 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
this.fontMap.forEach(function(value, key) {
fontIdToName.push(key + " = " + value.name)
const fontName = value.name.toLowerCase();
- var format;
+ var type;
if (key == mostUsedFont) {
- format = StringFormat.STANDARD;
+ type = null;
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
- format = StringFormat.BOLD_OBLIQUE;
+ type = WordType.BOLD_OBLIQUE;
} else if (fontName.includes('bold')) {
- format = StringFormat.BOLD;
+ type = WordType.BOLD;
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
- format = StringFormat.OBLIQUE;
+ type = WordType.OBLIQUE;
} else if (fontName === maxHeightFont) {
- format = StringFormat.BOLD;
- } else {
- format = StringFormat.STANDARD;
+ type = WordType.BOLD;
+ }
+ if (type) {
+ fontToFormats.set(key, type);
}
- fontToFormats.set(key, format);
});
fontIdToName.sort();
diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx
index 4b5b6f8..12446bb 100644
--- a/src/javascript/models/transformations/textitem/CompactLines.jsx
+++ b/src/javascript/models/transformations/textitem/CompactLines.jsx
@@ -1,16 +1,16 @@
import React from 'react';
-import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
-import { ParsedElements } from '../../PageItem.jsx';
+import LineItem from '../../LineItem.jsx';
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
-import TextItemLineCompactor from '../../TextItemLineCompactor.jsx';
+import LineConverter from '../../LineConverter.jsx';
import ElementType from '../../ElementType.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
-// gathers text items on the same y line to one text item
-export default class CompactLines extends ToTextItemTransformation {
+// gathers text items on the same y line to one line item
+export default class CompactLines extends ToLineItemTransformation {
constructor() {
super("Compact To Lines");
@@ -20,58 +20,44 @@ export default class CompactLines extends ToTextItemTransformation {
const {mostUsedDistance, fontToFormats} = parseResult.globals;
const foundFootnotes = [];
const foundFootnoteLinks = [];
- var inlineFormats = 0;
- var lineFormats = 0;
- var unopenedFormats = 0;
- var unclosedFormats = 0;
+ var formattedWords = 0;
const lineGrouper = new TextItemLineGrouper({
mostUsedDistance: mostUsedDistance,
});
- const lineCompactor = new TextItemLineCompactor(fontToFormats);
+ const lineCompactor = new LineConverter(fontToFormats);
parseResult.pages.forEach(page => {
if (page.items.length > 0) {
- const newItems = [];
+ const lineItems = [];
const textItemsGroupedByLine = lineGrouper.group(page.items);
- textItemsGroupedByLine.forEach(textItemsOfLine => {
- var lineItem;
- if (textItemsOfLine.length == 1) {
- lineItem = textItemsOfLine[0];
- const formatType = fontToFormats.get(lineItem.font);
- if (formatType.needFormat) {
- lineItem.lineFormat = formatType;
- lineItem.parsedElements = new ParsedElements({
- completeLineFormats: 1
- });
- }
- } else {
- textItemsOfLine.forEach(item => {
- item.annotation = REMOVED_ANNOTATION;
- newItems.push(item);
- });
-
- lineItem = lineCompactor.compact(textItemsOfLine);
+ textItemsGroupedByLine.forEach(lineTextItems => {
+ const lineItem = lineCompactor.compact(lineTextItems);
+ if (lineTextItems.length > 1) {
lineItem.annotation = ADDED_ANNOTATION;
-
- if (lineItem.parsedElements.footnoteLinks.length > 0) {
- const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },);
- foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
- }
- if (lineItem.parsedElements.footnotes.length > 0) {
- lineItem.type = ElementType.FOOTNOTES;
- const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },);
- foundFootnotes.push.apply(foundFootnotes, footnotes);
- }
- inlineFormats += lineItem.parsedElements.inlineFormats;
+ lineTextItems.forEach(item => {
+ item.annotation = REMOVED_ANNOTATION;
+ lineItems.push(new LineItem({
+ ...item
+ }));
+ });
+ }
+ if (lineItem.words.length == 0) {
+ lineItem.annotation = REMOVED_ANNOTATION;
+ }
+ lineItems.push(lineItem);
+
+ if (lineItem.parsedElements.footnoteLinks.length > 0) {
+ const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },);
+ foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
+ }
+ if (lineItem.parsedElements.footnotes.length > 0) {
+ lineItem.type = ElementType.FOOTNOTES;
+ const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },);
+ foundFootnotes.push.apply(foundFootnotes, footnotes);
}
- if (lineItem.lineFormat) lineFormats++;
- if (lineItem.unopenedFormat) unopenedFormats++;
- if (lineItem.unclosedFormat) unclosedFormats++;
- lineItem.text = lineItem.text.trim();
- newItems.push(lineItem);
});
- page.items = newItems;
+ page.items = lineItems;
}
});
@@ -79,11 +65,8 @@ export default class CompactLines extends ToTextItemTransformation {
return new ParseResult({
...parseResult,
messages: [
- 'Detected ' + lineFormats + ' line formats',
- 'Detected ' + inlineFormats + ' inline formats',
- 'Detected ' + unclosedFormats + ' opened un-closed formats',
- 'Detected ' + unopenedFormats + ' un-opened closed formats',
- Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }],
+ 'Detected ' + formattedWords + ' formatted words',
+ Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }],
Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }],
]
});
diff --git a/src/javascript/models/transformations/textitem/CompleteFormats.jsx b/src/javascript/models/transformations/textitem/CompleteFormats.jsx
index c073f34..e6ac7f8 100644
--- a/src/javascript/models/transformations/textitem/CompleteFormats.jsx
+++ b/src/javascript/models/transformations/textitem/CompleteFormats.jsx
@@ -6,6 +6,8 @@ import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../.
//Complete unopened/unclosed bold/italic formats
export default class CompleteFormats extends ToTextItemTransformation {
+ //TODO move to block and ignore quotes
+
constructor() {
super("Complete Bold/Italics");
}
@@ -81,7 +83,6 @@ class ItemStack {
}
consume(item) {
- const te = item.text;
var newItem;
const handleFreshUnopened = () => {
diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
index 6d3d20d..93d36ad 100644
--- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx
+++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
@@ -1,4 +1,4 @@
-import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
@@ -6,7 +6,7 @@ import { headlineByLevel } from '../../ElementType.jsx';
import { isListItem } from '../../../functions.jsx';
//Detect items starting with -, •, etc...
-export default class DetectHeaders extends ToTextItemTransformation {
+export default class DetectHeaders extends ToLineItemTransformation {
constructor() {
super("Detect Headers");
@@ -21,15 +21,15 @@ export default class DetectHeaders extends ToTextItemTransformation {
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
pagesWithMaxHeight.forEach(titlePage => {
- titlePage.items.forEach(textItem => {
- const height = textItem.height;
- if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
+ titlePage.items.forEach(item => {
+ const height = item.height;
+ if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
if (height == maxHeight) {
- textItem.type = ElementType.H1;
+ item.type = ElementType.H1;
} else {
- textItem.type = ElementType.H2;
+ item.type = ElementType.H2;
}
- textItem.annotation = DETECTED_ANNOTATION;
+ item.annotation = DETECTED_ANNOTATION;
detectedHeaders++;
}
});
@@ -41,10 +41,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
var range = headlineTypeToHeightRange[headlineType];
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
parseResult.pages.forEach(page => {
- page.items.forEach(textItem => {
- if (!textItem.type && textItem.height == range.max) {
- textItem.annotation = DETECTED_ANNOTATION;
- textItem.type = ElementType.enumValueOf(headlineType);
+ page.items.forEach(item => {
+ if (!item.type && item.height == range.max) {
+ item.annotation = DETECTED_ANNOTATION;
+ item.type = ElementType.enumValueOf(headlineType);
detectedHeaders++
}
});
@@ -56,10 +56,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
const heights = [];
var lastHeight;
parseResult.pages.forEach(page => {
- page.items.forEach(textItem => {
- if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) {
- if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
- heights.push(textItem.height);
+ page.items.forEach(item => {
+ if (!item.type && item.height > mostUsedHeight && !isListItem(item.text())) {
+ if (!heights.includes(item.height) && (!lastHeight || lastHeight > item.height)) {
+ heights.push(item.height);
}
}
});
@@ -69,11 +69,11 @@ export default class DetectHeaders extends ToTextItemTransformation {
heights.forEach((height, i) => {
const headlineType = headlineByLevel(2 + i);
parseResult.pages.forEach(page => {
- page.items.forEach(textItem => {
- if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) {
+ page.items.forEach(item => {
+ if (!item.type && item.height == height && !isListItem(item.text())) {
detectedHeaders++;
- textItem.annotation = DETECTED_ANNOTATION;
- textItem.type = headlineType;
+ item.annotation = DETECTED_ANNOTATION;
+ item.type = headlineType;
}
});
});
@@ -83,9 +83,9 @@ export default class DetectHeaders extends ToTextItemTransformation {
//find headlines which have paragraph height
var smallesHeadlineLevel = 1;
parseResult.pages.forEach(page => {
- page.items.forEach(textItem => {
- if (textItem.type && textItem.type.headline) {
- smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel);
+ page.items.forEach(item => {
+ if (item.type && item.type.headline) {
+ smallesHeadlineLevel = Math.max(smallesHeadlineLevel, item.type.headlineLevel);
}
});
});
@@ -93,18 +93,18 @@ export default class DetectHeaders extends ToTextItemTransformation {
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
parseResult.pages.forEach(page => {
var lastItem;
- page.items.forEach(textItem => {
- if (!textItem.type
- && textItem.height == mostUsedHeight
- && textItem.font !== mostUsedFont
- && (!lastItem || lastItem.y < textItem.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - textItem.y > mostUsedDistance * 2))
- && textItem.text === textItem.text.toUpperCase()
+ page.items.forEach(item => {
+ if (!item.type
+ && item.height == mostUsedHeight
+ && item.font !== mostUsedFont
+ && (!lastItem || lastItem.y < item.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - item.y > mostUsedDistance * 2))
+ && item.text() === item.text().toUpperCase()
) {
detectedHeaders++;
- textItem.annotation = DETECTED_ANNOTATION;
- textItem.type = nextHeadlineType;
+ item.annotation = DETECTED_ANNOTATION;
+ item.type = nextHeadlineType;
}
- lastItem = textItem;
+ lastItem = item;
});
});
}
@@ -124,8 +124,8 @@ export default class DetectHeaders extends ToTextItemTransformation {
function findPagesWithMaxHeight(pages, maxHeight) {
const maxHeaderPagesSet = new Set();
pages.forEach(page => {
- page.items.forEach(textItem => {
- if (!textItem.type && textItem.height == maxHeight) {
+ page.items.forEach(item => {
+ if (!item.type && item.height == maxHeight) {
maxHeaderPagesSet.add(page);
}
});
diff --git a/src/javascript/models/transformations/textitem/DetectListItems.jsx b/src/javascript/models/transformations/textitem/DetectListItems.jsx
index 0c99ee2..8bdd197 100644
--- a/src/javascript/models/transformations/textitem/DetectListItems.jsx
+++ b/src/javascript/models/transformations/textitem/DetectListItems.jsx
@@ -1,12 +1,12 @@
-import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
-import TextItem from '../../TextItem.jsx';
+import LineItem from '../../LineItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
//Detect items starting with -, •, etc...
-export default class DetectListItems extends ToTextItemTransformation {
+export default class DetectListItems extends ToLineItemTransformation {
constructor() {
super("Detect List Items");
@@ -16,34 +16,34 @@ export default class DetectListItems extends ToTextItemTransformation {
var foundListItems = 0;
var foundNumberedItems = 0;
parseResult.pages.forEach(page => {
- const newTextItems = [];
- page.items.forEach(textItem => {
- newTextItems.push(textItem);
- if (!textItem.type) {
- var text = textItem.text;
+ const newItems = [];
+ page.items.forEach(item => {
+ newItems.push(item);
+ if (!item.type) {
+ var text = item.text();
if (isListItem(text)) {
foundListItems++
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
if (textWithDash === text) {
- textItem.annotation = DETECTED_ANNOTATION;
- textItem.type = ElementType.LIST;
+ item.annotation = DETECTED_ANNOTATION;
+ item.type = ElementType.LIST;
} else {
- textItem.annotation = REMOVED_ANNOTATION;
- newTextItems.push(new TextItem({
- ...textItem,
+ item.annotation = REMOVED_ANNOTATION;
+ newItems.push(new LineItem({
+ ...item,
text: textWithDash,
annotation: ADDED_ANNOTATION,
type: ElementType.LIST
}));
}
- } else if (isNumberedListItem(text)) {
+ } else if (isNumberedListItem(text)) { //TODO check that starts with 1 (kala chakra)
foundNumberedItems++;
- textItem.annotation = DETECTED_ANNOTATION;
- textItem.type = ElementType.LIST;
+ item.annotation = DETECTED_ANNOTATION;
+ item.type = ElementType.LIST;
}
}
});
- page.items = newTextItems;
+ page.items = newItems;
});
return new ParseResult({
diff --git a/src/javascript/models/transformations/textitem/DetectTOC.jsx b/src/javascript/models/transformations/textitem/DetectTOC.jsx
index ff9ecf3..93ca758 100644
--- a/src/javascript/models/transformations/textitem/DetectTOC.jsx
+++ b/src/javascript/models/transformations/textitem/DetectTOC.jsx
@@ -1,14 +1,15 @@
-import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
-import TextItem from '../../TextItem.jsx';
+import LineItem from '../../LineItem.jsx';
+import Word from '../../Word.jsx';
import HeadlineFinder from '../../HeadlineFinder.jsx';
-import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx';
-import { isDigit, wordMatch } from '../../../functions.jsx'
+import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx'
-//Detect table of contents pages
-export default class DetectTOC extends ToTextItemTransformation {
+//Detect table of contents pages plus linked headlines
+export default class DetectTOC extends ToLineItemTransformation {
constructor() {
super("Detect TOC");
@@ -17,64 +18,68 @@ export default class DetectTOC extends ToTextItemTransformation {
transform(parseResult:ParseResult) {
const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
-
const linkLeveler = new LinkLeveler();
+
+
var tocLinks = [];
var lastTocPage;
var headlineItem;
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
- const lineItemsWithDigits = [];
+ var lineItemsWithDigits = 0;
const unknownLines = new Set();
const pageTocLinks = [];
- var lastLineTextWithoutNumber;
+ var lastWordsWithoutNumber;
var lastLine;
+ //find lines ending with a number per page
page.items.forEach(line => {
- var lineText = line.text.replace(/\./g, '').trim();
- var endsWithDigit = false;
- var digits = [];
- while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
- digits.unshift(lineText.charAt(lineText.length - 1));
- lineText = lineText.substring(0, lineText.length - 1);
- endsWithDigit = true;
+ var words = line.words.filter(word => !hasOnly(word.string, '.'));
+ const digits = [];
+ while (words.length > 0 && isNumber(words[words.length - 1].string)) {
+ const lastWord = words.pop();
+ digits.unshift(lastWord.string);
}
- lineText = lineText.trim();
+
+ if (digits.length == 0 && words.length > 0) {
+ const lastWord = words[words.length - 1];
+ while (isDigit(lastWord.string.charCodeAt(lastWord.string.length - 1))) {
+ digits.unshift(lastWord.string.charAt(lastWord.string.length - 1))
+ lastWord.string = lastWord.string.substring(0, lastWord.string.length - 1);
+ }
+ }
+ var endsWithDigit = digits.length > 0;
if (endsWithDigit) {
endsWithDigit = true;
- if (lastLineTextWithoutNumber) { // 2-line item ?
- lineText = lastLineTextWithoutNumber + ' ' + lineText;
- lastLineTextWithoutNumber = null;
+ if (lastWordsWithoutNumber) { // 2-line item ?
+ words.push(...lastWordsWithoutNumber);
+ lastWordsWithoutNumber = null;
}
pageTocLinks.push(new TocLink({
pageNumber: parseInt(digits.join('')),
- textItem: new TextItem({
+ lineItem: new LineItem({
...line,
- text: lineText
+ words: words
})
}));
- lineItemsWithDigits.push(new TextItem({
- ...line,
- text: lineText
- }));
- lastLineTextWithoutNumber = null;
+ lineItemsWithDigits++;
} else {
if (!headlineItem) {
headlineItem = line;
} else {
- if (lastLineTextWithoutNumber) {
+ if (lastWordsWithoutNumber) {
unknownLines.add(lastLine);
}
- lastLineTextWithoutNumber = lineText;
+ lastWordsWithoutNumber = words;
lastLine = line;
}
}
});
// page has been processed
- if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
+ if (lineItemsWithDigits * 100 / page.items.length > 75) {
tocPages.push(page.index + 1);
lastTocPage = page;
linkLeveler.levelPageItems(pageTocLinks);
- tocLinks = tocLinks.concat(pageTocLinks);
+ tocLinks.push(...pageTocLinks);
const newBlocks = [];
page.items.forEach((line) => {
@@ -83,7 +88,7 @@ export default class DetectTOC extends ToTextItemTransformation {
}
newBlocks.push(line);
if (line === headlineItem) {
- newBlocks.push(new TextItem({
+ newBlocks.push(new LineItem({
...line,
type: ElementType.H2,
annotation: ADDED_ANNOTATION
@@ -105,8 +110,10 @@ export default class DetectTOC extends ToTextItemTransformation {
if (tocPages.length > 0) {
// Add TOC items
tocLinks.forEach(tocLink => {
- lastTocPage.items.push(new TextItem({
- text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
+ lastTocPage.items.push(new LineItem({
+ words: [new Word({
+ string: ' '.repeat(tocLink.level * 3) + '-'
+ })].concat(tocLink.lineItem.words),
type: ElementType.TOC,
annotation: ADDED_ANNOTATION
}));
@@ -118,11 +125,11 @@ export default class DetectTOC extends ToTextItemTransformation {
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
var foundHealineItems;
if (linkedPage) {
- foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
+ foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
if (!foundHealineItems) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
if (linkedPage) {
- foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
+ foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
}
}
}
@@ -142,11 +149,16 @@ export default class DetectTOC extends ToTextItemTransformation {
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
const heightRange = headlineTypeToHeightRange[headlineType.name];
if (heightRange) {
- const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
- if (textItem) {
- textItem.type = headlineType;
- textItem.annotation = DETECTED_ANNOTATION;
- foundBySize.push(textItem.text);
+ const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
+ if (lineIndex > -1) {
+ const page = parseResult.pages[pageIndex];
+ page.items[lineIndex].annotation = REMOVED_ANNOTATION;
+ page.items.splice(lineIndex + 1, 0, new LineItem({
+ ...notFoundTocLink.lineItem,
+ type: headlineType,
+ annotation: ADDED_ANNOTATION,
+ }));
+ foundBySize.push(notFoundTocLink);
}
}
});
@@ -173,12 +185,12 @@ export default class DetectTOC extends ToTextItemTransformation {
const messages = [];
messages.push('Detected ' + tocPages.length + ' table of content pages');
if (tocPages.length > 0) {
- messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
+ messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
}
if (notFoundHeadlines.length > 0) {
- messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
- messages.push('Found TOC headlines (by size): ' + foundBySize);
+ messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text()));
+ messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber));
}
return new ParseResult({
...parseResult,
@@ -196,7 +208,7 @@ export default class DetectTOC extends ToTextItemTransformation {
//Find out how the TOC page link actualy translates to the page.index
function detectPageMappingNumber(pages, tocLinks) {
for ( var tocLink of tocLinks ) {
- const page = findPageWithHeadline(pages, tocLink.textItem.text);
+ const page = findPageWithHeadline(pages, tocLink.lineItem.text());
if (page) {
return page.index - tocLink.pageNumber;
}
@@ -235,9 +247,9 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
const headlineType = headlineByLevel(tocLink.level + 2);
const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
- page.items.splice(foundItems.lineIndex + 1, 0, new TextItem({
+ page.items.splice(foundItems.lineIndex + 1, 0, new LineItem({
...foundItems.headlineItems[0],
- text: tocLink.textItem.text,
+ words: tocLink.lineItem.words,
height: headlineHeight,
type: headlineType,
annotation: ADDED_ANNOTATION
@@ -255,21 +267,22 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
}
}
-function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
+function findPageAndLineFromHeadline(pages, tocLink, heightRange, fromPage, toPage) {
+ const linkText = tocLink.lineItem.text().toUpperCase();
for (var i = fromPage; i <= toPage; i++) {
const page = pages[i - 1];
- for ( var line of page.items ) {
+ const lineIndex = page.items.findIndex(line => {
if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
- const match = wordMatch(tocLink.textItem.text, line.text);
- if (match >= 0.5) {
- return line;
- }
+ const match = wordMatch(linkText, line.text());
+ return match >= 0.5;
}
- }
+ return false;
+ });
+ if (lineIndex > -1) return [i - 1, lineIndex];
}
+ return [-1, -1];
}
-
class LinkLeveler {
constructor() {
this.levelByMethod = null;
@@ -297,13 +310,13 @@ class LinkLeveler {
levelByXDiff(tocLinks) {
const uniqueX = this.calculateUniqueX(tocLinks);
tocLinks.forEach(link => {
- link.level = uniqueX.indexOf(link.textItem.x);
+ link.level = uniqueX.indexOf(link.lineItem.x);
});
}
levelByFont(tocLinks) {
tocLinks.forEach(link => {
- link.level = this.uniqueFonts.indexOf(link.textItem.font);
+ link.level = this.uniqueFonts.indexOf(link.lineItem.font);
});
}
@@ -315,7 +328,7 @@ class LinkLeveler {
calculateUniqueX(tocLinks) {
var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
- if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
+ if (uniquesArray.indexOf(link.lineItem.x) < 0) uniquesArray.push(link.lineItem.x);
return uniquesArray;
}, []);
@@ -328,7 +341,7 @@ class LinkLeveler {
calculateUniqueFonts(tocLinks) {
var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
- if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
+ if (uniquesArray.indexOf(link.lineItem.font) < 0) uniquesArray.push(link.lineItem.font);
return uniquesArray;
}, []);
@@ -339,7 +352,7 @@ class LinkLeveler {
class TocLink {
constructor(options) {
- this.textItem = options.textItem;
+ this.lineItem = options.lineItem;
this.pageNumber = options.pageNumber;
this.level = 0;
}
diff --git a/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx
index 3d56883..d84f82e 100644
--- a/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx
+++ b/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx
@@ -1,4 +1,4 @@
-import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
@@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) {
// Remove elements with similar content on same page positions, like page numbers, licenes information, etc...
-export default class RemoveRepetitiveElements extends ToTextItemTransformation {
+export default class RemoveRepetitiveElements extends ToLineItemTransformation {
constructor() {
super("Remove Repetitive Elements");
@@ -58,8 +58,8 @@ export default class RemoveRepetitiveElements extends ToTextItemTransformation {
maxElements: []
});
- const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
- const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
+ const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
+ const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
pageStore.push({
minElements: minMaxItems.minElements,
maxElements: minMaxItems.maxElements,
diff --git a/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx b/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx
index 6290d22..649df96 100644
--- a/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx
+++ b/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx
@@ -1,10 +1,11 @@
-import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
-import TextItem from '../../TextItem.jsx';
+import LineItem from '../../LineItem.jsx';
+import StashingStream from '../../StashingStream.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
// Converts vertical text to horizontal
-export default class VerticalToHorizontal extends ToTextItemTransformation {
+export default class VerticalToHorizontal extends ToLineItemTransformation {
constructor() {
super("Vertical to Horizontal Text");
@@ -12,87 +13,64 @@ export default class VerticalToHorizontal extends ToTextItemTransformation {
transform(parseResult:ParseResult) {
var foundVerticals = 0;
- const newPages = parseResult.pages.map(page => {
- const newTextItems = [];
- // var oneCharacterItems = [];
-
- // const applyTransformation = () => {
- // oneCharacterItems.forEach(item => {
- // item.annotation = REMOVED_ANNOTATION;
- // newTextItems.push(item);
- // //TODO add new
- // });
- // oneCharacterItems = [];
- // };
- // const rollbackTransformation = () => {
- // oneCharacterItems.forEach(item => {
- // newTextItems.push(item);
- // });
- // oneCharacterItems = [];
- // };
-
- //TODO generic state machine code ?
-
- const leftOver = page.items.reduce((oneCharacterItems, item) => {
- if (item.text.trim().length == 1) {
- if (oneCharacterItems.length == 0) {
- oneCharacterItems.push(item);
- } else {
- const lastItem = oneCharacterItems[oneCharacterItems.length - 1];
- if (lastItem.y - item.y > 5 && lastItem.font === item.font) {
- oneCharacterItems.push(item);
- } else {
- if (oneCharacterItems.length > 5) {
- var combinedText = '';
- var minX = 999;
- var maxY = 0;
- var sumWidth = 0;
- var maxHeight = 0;
- oneCharacterItems.forEach(oneCharacterItem => {
- oneCharacterItem.annotation = REMOVED_ANNOTATION;
- newTextItems.push(oneCharacterItem);
- combinedText += oneCharacterItem.text.trim();
- minX = Math.min(minX, oneCharacterItem.x);
- maxY = Math.max(maxY, oneCharacterItem.y);
- sumWidth += oneCharacterItem.width;
- maxHeight = Math.max(maxHeight, oneCharacterItem.height);
- });
- newTextItems.push(new TextItem({
- ...oneCharacterItems[0],
- x: minX,
- y: maxY,
- width: sumWidth,
- height: maxHeight,
- text: combinedText,
- annotation: ADDED_ANNOTATION
- }));
- foundVerticals++;
- } else {
- oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
- }
- oneCharacterItems = [item];
- }
- }
- } else {
- oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
- oneCharacterItems = [];
- newTextItems.push(item);
- }
- return oneCharacterItems;
- }, []);
- leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
-
- return {
- ...page,
- items: newTextItems
- };
+ parseResult.pages.forEach(page => {
+ const stream = new VerticalsStream();
+ stream.consumeAll(page.items);
+ page.items = stream.complete();
+ foundVerticals += stream.foundVerticals;
});
+
return new ParseResult({
...parseResult,
- pages: newPages,
messages: ["Converted " + foundVerticals + " verticals"]
});
}
-
+}
+
+class VerticalsStream extends StashingStream {
+
+ constructor() {
+ super();
+ this.foundVerticals = 0;
+ }
+
+ shouldStash(item) {
+ return item.words.length == 1 && item.words[0].string.length == 1;
+ }
+
+ doMatchesStash(lastItem, item) {
+ return lastItem.y - item.y > 5 && lastItem.words[0].type === item.words[0].type;
+ }
+
+ doFlushStash(stash, results) {
+ if (stash.length > 5) { // unite
+ var combinedWords = [];
+ var minX = 999;
+ var maxY = 0;
+ var sumWidth = 0;
+ var maxHeight = 0;
+ stash.forEach(oneCharacterLine => {
+ oneCharacterLine.annotation = REMOVED_ANNOTATION;
+ results.push(oneCharacterLine);
+ combinedWords.push(oneCharacterLine.words[0]);
+ minX = Math.min(minX, oneCharacterLine.x);
+ maxY = Math.max(maxY, oneCharacterLine.y);
+ sumWidth += oneCharacterLine.width;
+ maxHeight = Math.max(maxHeight, oneCharacterLine.height);
+ });
+ results.push(new LineItem({
+ ...stash[0],
+ x: minX,
+ y: maxY,
+ width: sumWidth,
+ height: maxHeight,
+ words: combinedWords,
+ annotation: ADDED_ANNOTATION
+ }));
+ this.foundVerticals++;
+ } else { //add as singles
+ results.push(...stash);
+ }
+ }
}
diff --git a/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx b/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx
index 1f96b88..6b0dd45 100644
--- a/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx
+++ b/src/javascript/models/transformations/textitemblock/DetectCodeQuoteBlocks.jsx
@@ -1,11 +1,11 @@
-import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx';
+import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
-import { minXFromBlocks } from '../../../textItemFunctions.jsx';
+import { minXFromBlocks } from '../../../pageItemFunctions.jsx';
//Detect items which are code/quote blocks
-export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation {
+export default class DetectCodeQuoteBlocks extends ToLineItemBlockTransformation {
constructor() {
super("Detect Code/Quote Blocks");
@@ -17,7 +17,7 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
parseResult.pages.forEach(page => {
var minX = minXFromBlocks(page.items);
page.items.forEach(block => {
- if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) {
+ if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) {
block.annotation = DETECTED_ANNOTATION;
block.type = ElementType.CODE;
foundCodeItems++;
@@ -36,14 +36,14 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
}
-function looksLikeCodeBlock(minX, textItems, mostUsedHeight) {
- if (textItems.length == 0) {
+function looksLikeCodeBlock(minX, items, mostUsedHeight) {
+ if (items.length == 0) {
return false;
}
- if (textItems.length == 1) {
- return textItems[0].x > minX && textItems[0].height <= mostUsedHeight + 1;
+ if (items.length == 1) {
+ return items[0].x > minX && items[0].height <= mostUsedHeight + 1;
}
- for ( var item of textItems ) {
+ for ( var item of items ) {
if (item.x == minX) {
return false;
}
diff --git a/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx b/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx
index e058c2b..eb766d5 100644
--- a/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx
+++ b/src/javascript/models/transformations/textitemblock/DetectListLevels.jsx
@@ -1,10 +1,11 @@
-import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx';
+import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
+import Word from '../../Word.jsx';
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
// Cares for proper sub-item spacing/leveling
-export default class DetectListLevels extends ToTextItemBlockTransformation {
+export default class DetectListLevels extends ToLineItemBlockTransformation {
constructor() {
super("Level Lists");
@@ -21,23 +22,25 @@ export default class DetectListLevels extends ToTextItemBlockTransformation {
var currentLevel = 0;
const xByLevel = {};
var modifiedBlock = false;
- listBlock.textItems.forEach(textItem => {
+ listBlock.items.forEach(item => {
const isListItem = true;
if (lastItemX && isListItem) {
- if (textItem.x > lastItemX) {
+ if (item.x > lastItemX) {
currentLevel++;
- xByLevel[textItem.x] = currentLevel;
- } else if (textItem.x < lastItemX) {
- currentLevel = xByLevel[textItem.x];
+ xByLevel[item.x] = currentLevel;
+ } else if (item.x < lastItemX) {
+ currentLevel = xByLevel[item.x];
}
} else {
- xByLevel[textItem.x] = 0;
+ xByLevel[item.x] = 0;
}
if (currentLevel > 0) {
- textItem.text = ' '.repeat(currentLevel * 3) + textItem.text;
+ item.words = [new Word({
+ string: ' '.repeat(currentLevel * 3)
+ })].concat(item.words);
modifiedBlock = true;
}
- lastItemX = textItem.x;
+ lastItemX = item.x;
});
listBlocks++;
if (modifiedBlock) {
diff --git a/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx b/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx
index 0589739..6e179f2 100644
--- a/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx
+++ b/src/javascript/models/transformations/textitemblock/GatherBlocks.jsx
@@ -1,11 +1,11 @@
-import ToTextItemBlockTransformation from '../ToTextItemBlockTransformation.jsx';
+import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
-import TextItemBlock from '../../TextItemBlock.jsx';
+import LineItemBlock from '../../LineItemBlock.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
-import { minXFromTextItems } from '../../../textItemFunctions.jsx';
+import { minXFromPageItems } from '../../../pageItemFunctions.jsx';
// Gathers lines to blocks
-export default class GatherBlocks extends ToTextItemBlockTransformation {
+export default class GatherBlocks extends ToLineItemBlockTransformation {
constructor() {
super("Gather Blocks");
@@ -14,29 +14,29 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0;
- var textItems = 0;
+ var lineItemCount = 0;
parseResult.pages.map(page => {
- textItems += page.items.length;
+ lineItemCount += page.items.length;
const blocks = [];
- var stashedBlock = new TextItemBlock({});
+ var stashedBlock = new LineItemBlock({});
const flushStashedItems = () => {
- if (stashedBlock.textItems.length > 1) {
+ if (stashedBlock.items.length > 1) {
stashedBlock.annotation = DETECTED_ANNOTATION;
}
blocks.push(stashedBlock);
- stashedBlock = new TextItemBlock({});
+ stashedBlock = new LineItemBlock({});
createdBlocks++;
};
- var minX = minXFromTextItems(page.items);
+ var minX = minXFromPageItems(page.items);
page.items.forEach(item => {
- if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
+ if (stashedBlock.items.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
flushStashedItems();
}
- stashedBlock.addTextItem(item);
+ stashedBlock.addItem(item);
});
- if (stashedBlock.textItems.length > 0) {
+ if (stashedBlock.items.length > 0) {
flushStashedItems();
}
page.items = blocks;
@@ -44,7 +44,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
return new ParseResult({
...parseResult,
- messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items']
+ messages: ['Gathered ' + createdBlocks + ' blocks out of ' + lineItemCount + ' line items']
});
}
@@ -54,7 +54,7 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
return false;
}
- const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
+ const lastItem = stashedBlock.items[stashedBlock.items.length - 1];
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
return false;
diff --git a/src/javascript/textItemFunctions.jsx b/src/javascript/pageItemFunctions.jsx
similarity index 58%
rename from src/javascript/textItemFunctions.jsx
rename to src/javascript/pageItemFunctions.jsx
index 8c33ffc..562e262 100644
--- a/src/javascript/textItemFunctions.jsx
+++ b/src/javascript/pageItemFunctions.jsx
@@ -1,10 +1,10 @@
-import TextItemBlock from './models/TextItemBlock.jsx';
-import TextItem from './models/TextItem.jsx';
+import PageItem from './models/PageItem.jsx';
+import LineItemBlock from './models/LineItemBlock.jsx';
-export function minXFromBlocks(blocks:TextItemBlock[]) {
+export function minXFromBlocks(blocks:LineItemBlock[]) {
var minX = 999;
blocks.forEach(block => {
- block.textItems.forEach(item => {
+ block.items.forEach(item => {
minX = Math.min(minX, item.x)
});
});
@@ -14,7 +14,7 @@ export function minXFromBlocks(blocks:TextItemBlock[]) {
return minX;
}
-export function minXFromTextItems(items:TextItem) {
+export function minXFromPageItems(items:PageItem) {
var minX = 999;
items.forEach(item => {
minX = Math.min(minX, item.x)
@@ -25,13 +25,13 @@ export function minXFromTextItems(items:TextItem) {
return minX;
}
-export function sortByX(items:TextItem) {
+export function sortByX(items:PageItem) {
items.sort((a, b) => {
return a.x - b.x;
});
}
-export function sortCopyByX(items:TextItem) {
+export function sortCopyByX(items:PageItem) {
const copy = items.concat();
sortByX(copy);
return copy;
diff --git a/test/HeadlineFinder.spec.js b/test/HeadlineFinder.spec.js
index 629c4b0..22bf00a 100644
--- a/test/HeadlineFinder.spec.js
+++ b/test/HeadlineFinder.spec.js
@@ -1,31 +1,30 @@
import { expect } from 'chai';
import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
-import TextItem from '../src/javascript/models/TextItem.jsx';
+import LineItem from '../src/javascript/models/LineItem.jsx';
describe('HeadlineFinder', () => {
-
it('Not Found - Case 1', () => {
const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline'
});
- const item1 = new TextItem({
+ const item1 = new LineItem({
text: 'My '
});
- const item2 = new TextItem({
+ const item2 = new LineItem({
text: 'Little'
});
- const item3 = new TextItem({
+ const item3 = new LineItem({
text: ' Headline2'
});
expect(headlineFinder.consume(item1)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
});
@@ -33,22 +32,22 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline'
});
- const item1 = new TextItem({
+ const item1 = new LineItem({
text: 'My '
});
- const item2 = new TextItem({
+ const item2 = new LineItem({
text: 'Little'
});
- const item3 = new TextItem({
+ const item3 = new LineItem({
text: ' Headline'
});
expect(headlineFinder.consume(item1)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
});
@@ -56,27 +55,27 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline'
});
- const item0 = new TextItem({
+ const item0 = new LineItem({
text: 'Waste '
});
- const item1 = new TextItem({
+ const item1 = new LineItem({
text: 'My '
});
- const item2 = new TextItem({
+ const item2 = new LineItem({
text: 'Little'
});
- const item3 = new TextItem({
+ const item3 = new LineItem({
text: ' Headline'
});
expect(headlineFinder.consume(item0)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
expect(headlineFinder.consume(item1)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
});
@@ -84,27 +83,27 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({
headline: 'My Little Headline'
});
- const item0 = new TextItem({
+ const item0 = new LineItem({
text: 'My '
});
- const item1 = new TextItem({
+ const item1 = new LineItem({
text: 'My '
});
- const item2 = new TextItem({
+ const item2 = new LineItem({
text: 'Little'
});
- const item3 = new TextItem({
+ const item3 = new LineItem({
text: ' Headline'
});
expect(headlineFinder.consume(item0)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item0);
expect(headlineFinder.consume(item1)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
});
@@ -112,22 +111,22 @@ describe('HeadlineFinder', () => {
const headlineFinder = new HeadlineFinder({
headline: 'MYLitt le HEADline'
});
- const item1 = new TextItem({
+ const item1 = new LineItem({
text: 'My '
});
- const item2 = new TextItem({
+ const item2 = new LineItem({
text: 'Little'
});
- const item3 = new TextItem({
+ const item3 = new LineItem({
text: ' Headline'
});
expect(headlineFinder.consume(item1)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
expect(headlineFinder.consume(item2)).to.equal(null);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
- expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+ expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
});
diff --git a/test/functions.spec.js b/test/functions.spec.js
index 221b04c..8aa553e 100644
--- a/test/functions.spec.js
+++ b/test/functions.spec.js
@@ -2,9 +2,10 @@ import { expect } from 'chai';
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
-describe('hasUpperCaseCharacterInMiddleOfWord', () => {
+describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => {
it('single word', () => {
+
expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false);
expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false);
@@ -38,7 +39,7 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
});
});
-describe('removeLeadingWhitespaces', () => {
+describe('functions: removeLeadingWhitespaces', () => {
it('No Removes', () => {
expect(removeLeadingWhitespaces(".")).to.be.equal(".");
expect(removeLeadingWhitespaces(". ")).to.be.equal(". ");
@@ -54,7 +55,7 @@ describe('removeLeadingWhitespaces', () => {
});
-describe('removeTrailingWhitespaces', () => {
+describe('functions: removeTrailingWhitespaces', () => {
it('No Removes', () => {
expect(removeTrailingWhitespaces(".")).to.be.equal(".");
expect(removeTrailingWhitespaces(" .")).to.be.equal(" .");
@@ -71,7 +72,7 @@ describe('removeTrailingWhitespaces', () => {
});
-describe('prefixAfterWhitespace', () => {
+describe('functions: prefixAfterWhitespace', () => {
it('Basic', () => {
expect(prefixAfterWhitespace('1', '2')).to.be.equal('12');
expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12');
@@ -81,7 +82,7 @@ describe('prefixAfterWhitespace', () => {
});
});
-describe('suffixBeforeWhitespace', () => {
+describe('functions: suffixBeforeWhitespace', () => {
it('Basic', () => {
expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.');
@@ -92,7 +93,7 @@ describe('suffixBeforeWhitespace', () => {
});
-describe('charCodeArray', () => {
+describe('functions: charCodeArray', () => {
it('Charcodes', () => {
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
});
@@ -105,7 +106,7 @@ describe('charCodeArray', () => {
});
-describe('normalizedCharCodeArray', () => {
+describe('functions: normalizedCharCodeArray', () => {
it('No Change', () => {
expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
@@ -131,7 +132,7 @@ describe('normalizedCharCodeArray', () => {
});
-describe('isListItem', () => {
+describe('functions: isListItem', () => {
it('Match', () => {
expect(isListItem('- my text')).to.equal(true);
@@ -154,7 +155,7 @@ describe('isListItem', () => {
});
-describe('isNumberedListItem', () => {
+describe('functions: isNumberedListItem', () => {
it('Match', () => {
expect(isNumberedListItem('1. my text')).to.equal(true);
@@ -173,7 +174,7 @@ describe('isNumberedListItem', () => {
});
-describe('wordsMatch', () => {
+describe('functions: wordsMatch', () => {
it('Match', () => {
expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
diff --git a/test/models/StashingStream.spec.js b/test/models/StashingStream.spec.js
new file mode 100644
index 0000000..538a4a9
--- /dev/null
+++ b/test/models/StashingStream.spec.js
@@ -0,0 +1,64 @@
+import { expect } from 'chai';
+
+import StashingStream from '../../src/javascript/models/StashingStream';
+import TextItem from '../../src/javascript/models/TextItem.jsx';
+
+describe('StashingStream', () => {
+
+    // Items are fed one at a time; the expectations pin the output order and the number
+    // of items that passed through MyStashingStream.doFlushStash().
+    it('Simple', () => {
+        const input = [
+            'a', 'b', 'a', 'a', 'z', 'm', 'm',
+            'z', 'z', 'c', 'e', 'f', 'm', 'a'
+        ];
+        const stashingStream = new MyStashingStream();
+
+        for (const character of input) {
+            stashingStream.consume(character);
+        }
+
+        const output = stashingStream.complete();
+
+        // 'a' and 'z' come back upper-cased, every 'm' is dropped, all other items are unchanged.
+        expect(output.join('')).to.equal('AbAAZZZcefA');
+        // Ten of the inputs are 'a', 'z' or 'm'; each of them is counted, including the dropped 'm's.
+        expect(stashingStream.transformedItems).to.equal(10);
+    });
+
+    // Same expectations, but the whole input is handed over with a single consumeAll() call.
+    it('ConsumeAll', () => {
+        const input = ['k', 'k', 'x', 'a', 'm', 'z', 'o', 'p'];
+        const stashingStream = new MyStashingStream();
+
+        stashingStream.consumeAll(input);
+        const output = stashingStream.complete();
+
+        // Only 'a', 'm' and 'z' are stashable, hence three counted items.
+        expect(output.join('')).to.equal('kkxAZop');
+        expect(stashingStream.transformedItems).to.equal(3);
+    });
+
+});
+
+
+class MyStashingStream extends StashingStream {
+    // Test double for the specs above: only 'a', 'z' and 'm' are stashable; a flush upper-cases the stash and drops 'm'.
+    constructor() {
+        super();
+        this.transformedItems = 0;
+    }
+    // Only the items 'a', 'z' and 'm' are stash candidates.
+    shouldStash(item) {
+        return ['a', 'z', 'm'].indexOf(item) !== -1;
+    }
+    // A candidate matches the stash only when it is strictly equal to the last stashed item.
+    doMatchesStash(lastItem, item) {
+        return item === lastItem;
+    }
+    // Counts every flushed item (including 'm'), then emits the remaining ones upper-cased.
+    doFlushStash(stash, results) {
+        this.transformedItems = this.transformedItems + stash.length;
+        stash.filter(entry => entry !== 'm').forEach(entry => results.push(entry.toUpperCase()));
+    }
+}
\ No newline at end of file