mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 15:23:26 +01:00
WIP Introduce word/wordType/lineItem
* Way to do the markdown transformation of inline formats (bold, italic, link, footnote, etc..) at the end and not in the middle * Introduce StashingStream as a helper
This commit is contained in:
parent
fde670e83f
commit
09facb09b4
@ -7,7 +7,7 @@
|
||||
"watch": "webpack -d --watch",
|
||||
"build": "webpack",
|
||||
"lint": "eslint src --ext .js --ext .jsx --cache",
|
||||
"test": "mocha --compilers js:babel-core/register test/*.spec.js",
|
||||
"test": "mocha --compilers js:babel-core/register test --recursive",
|
||||
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
|
||||
"deploy": "npm run release && cp -r build/* docs/"
|
||||
},
|
||||
|
@ -1,13 +1,12 @@
|
||||
import React from 'react';
|
||||
import PageView from './PageView.jsx';
|
||||
import TextItemTable from './TextItemTable.jsx';
|
||||
import LineItemTable from './LineItemTable.jsx';
|
||||
|
||||
// View for a Page which items are of kind TextItemBlock
|
||||
export default class TextItemBlockPageView extends PageView {
|
||||
// View for a Page whose items are of kind LineItemBlock
|
||||
export default class LineItemBlockPageView extends PageView {
|
||||
|
||||
createItemViews(items, showWhitespaces) {
|
||||
const blockTables = items.map((block, i) => {
|
||||
var textItems = block.textItems;
|
||||
const blockType = block.type ? ' - ' + block.type.name : null;
|
||||
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
|
||||
: null;
|
||||
@ -38,7 +37,7 @@ export default class TextItemBlockPageView extends PageView {
|
||||
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
|
||||
</div>
|
||||
<div style={ borderStyle }>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
<LineItemTable items={ block.items } showWhitespaces={ showWhitespaces } />
|
||||
{ footnoteLinks }
|
||||
{ footnotes }
|
||||
</div>
|
12
src/javascript/components/debug/LineItemPageView.jsx
Normal file
12
src/javascript/components/debug/LineItemPageView.jsx
Normal file
@ -0,0 +1,12 @@
|
||||
import React from 'react';
|
||||
import PageView from './PageView.jsx';
|
||||
import LineItemTable from './LineItemTable.jsx';
|
||||
|
||||
// View for a Page which items are of kind LineItem
|
||||
export default class LineItemPageView extends PageView {
|
||||
|
||||
createItemViews(items, showWhitespaces) {
|
||||
return <LineItemTable items={ items } showWhitespaces={ showWhitespaces } />
|
||||
}
|
||||
|
||||
}
|
108
src/javascript/components/debug/LineItemTable.jsx
Normal file
108
src/javascript/components/debug/LineItemTable.jsx
Normal file
@ -0,0 +1,108 @@
|
||||
import React from 'react';
|
||||
|
||||
import Table from 'react-bootstrap/lib/Table'
|
||||
|
||||
// Displays an array of LineItem as a table
|
||||
export default class LineItemTable extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
items: React.PropTypes.array.isRequired,
|
||||
showWhitespaces: React.PropTypes.bool
|
||||
};
|
||||
|
||||
render() {
|
||||
const {showWhitespaces, items} = this.props;
|
||||
const tableHeader = <thead>
|
||||
<tr>
|
||||
<th>
|
||||
#
|
||||
</th>
|
||||
<th>
|
||||
Text
|
||||
</th>
|
||||
<th>
|
||||
X
|
||||
</th>
|
||||
<th>
|
||||
Y
|
||||
</th>
|
||||
<th>
|
||||
Width
|
||||
</th>
|
||||
<th>
|
||||
Height
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
|
||||
const itemRows = items.map((item, i) => <tr key={ i } style={ item.annotation ? {
|
||||
color: item.annotation.color
|
||||
} : null }>
|
||||
<td>
|
||||
<div style={ { textAlign: 'center' } }>
|
||||
{ i }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center' } }>
|
||||
{ item.annotation ? item.annotation.category : '' }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center', color: 'brown' } }>
|
||||
{ item.type ? item.type.name : '' }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||
{ item.parsedElements && item.parsedElements.footnoteLinks.length > 0 ? <div>
|
||||
Footnote-Link
|
||||
</div> : '' }
|
||||
{ item.parsedElements && item.parsedElements.containLinks ? <div>
|
||||
Link
|
||||
</div> : '' }
|
||||
{ item.lineFormat ? <div>
|
||||
{ item.lineFormat.name }
|
||||
</div> : '' }
|
||||
{ item.unopenedFormat ? <div>
|
||||
Unopened
|
||||
{ ' ' + item.unopenedFormat.name }
|
||||
</div> : '' }
|
||||
{ item.parsedElements && item.parsedElements.inlineFormats > 0 ? <div>
|
||||
{ item.parsedElements.inlineFormats + 'x Bold/Italic' }
|
||||
</div> : '' }
|
||||
{ item.unclosedFormat ? <div>
|
||||
Unclosed
|
||||
{ ' ' + item.unclosedFormat.name }
|
||||
</div> : '' }
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
{ showWhitespaces ? (
|
||||
<pre style={ item.annotation ? {
|
||||
color: item.annotation.color,
|
||||
display: 'inline-block',
|
||||
} : {
|
||||
display: 'inline-block'
|
||||
} }>{ item.text() }</pre>
|
||||
) : (item.text()) }
|
||||
</td>
|
||||
<td>
|
||||
{ item.x }
|
||||
</td>
|
||||
<td>
|
||||
{ item.y }
|
||||
</td>
|
||||
<td>
|
||||
{ item.width }
|
||||
</td>
|
||||
<td>
|
||||
{ item.height }
|
||||
</td>
|
||||
</tr>
|
||||
)
|
||||
|
||||
return (
|
||||
<Table responsive condensed bordered>
|
||||
{ tableHeader }
|
||||
<tbody>
|
||||
{ itemRows }
|
||||
</tbody>
|
||||
</Table>
|
||||
);
|
||||
}
|
||||
}
|
@ -18,6 +18,17 @@ export function isNumber(string) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if every UTF-16 code unit of `string` equals the first code unit of `char`.
// An empty `string` yields true. An empty `char` has no first code unit (charCodeAt gives NaN),
// so it only "matches" the empty string.
export function hasOnly(string, char) {
    const charCode = char.charCodeAt(0);
    for (let i = 0; i < string.length; i++) {
        // strict comparison: both sides are numbers, no coercion wanted
        if (string.charCodeAt(i) !== charCode) {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
export function hasUpperCaseCharacterInMiddleOfWord(text) {
|
||||
var beginningOfWord = true;
|
||||
for (var i = 0; i < text.length; i++) {
|
||||
|
@ -1,6 +1,7 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
|
||||
|
||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||
@ -56,10 +57,10 @@ export default class AppState {
|
||||
new CompactLines(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new VerticalToHorizontal(),
|
||||
new PostprocessLines(),
|
||||
// new PostprocessLines(),
|
||||
new DetectTOC(),
|
||||
new DetectHeaders(),
|
||||
new CompleteFormats(),
|
||||
// new CompleteFormats(),
|
||||
new DetectListItems(),
|
||||
|
||||
new GatherBlocks(),
|
||||
|
@ -1,83 +1,85 @@
|
||||
import { Enum } from 'enumify';
|
||||
import TextItem from './TextItem.jsx';
|
||||
import TextItemBlock from './TextItemBlock.jsx';
|
||||
import LineItem from './LineItem.jsx';
|
||||
import LineItemBlock from './LineItemBlock.jsx';
|
||||
|
||||
// A Markdown element
|
||||
export default class ElementType extends Enum {
|
||||
}
|
||||
|
||||
//TODO rename to BlockType
|
||||
|
||||
ElementType.initEnum({
|
||||
H1: {
|
||||
headline: true,
|
||||
headlineLevel: 1,
|
||||
toText(block:TextItemBlock) {
|
||||
return '# ' + concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return '# ' + concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
H2: {
|
||||
headline: true,
|
||||
headlineLevel: 2,
|
||||
toText(block:TextItemBlock) {
|
||||
return '## ' + concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return '## ' + concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
H3: {
|
||||
headline: true,
|
||||
headlineLevel: 3,
|
||||
toText(block:TextItemBlock) {
|
||||
return '### ' + concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return '### ' + concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
H4: {
|
||||
headline: true,
|
||||
headlineLevel: 4,
|
||||
toText(block:TextItemBlock) {
|
||||
return '#### ' + concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return '#### ' + concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
H5: {
|
||||
headline: true,
|
||||
headlineLevel: 5,
|
||||
toText(block:TextItemBlock) {
|
||||
return '##### ' + concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return '##### ' + concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
H6: {
|
||||
headline: true,
|
||||
headlineLevel: 6,
|
||||
toText(block:TextItemBlock) {
|
||||
return '###### ' + concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return '###### ' + concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
TOC: {
|
||||
mergeToBlock: true,
|
||||
toText(block:TextItemBlock) {
|
||||
return concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
FOOTNOTES: {
|
||||
mergeToBlock: true,
|
||||
mergeFollowingNonTypedItems: true,
|
||||
toText(block:TextItemBlock) {
|
||||
return concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
CODE: {
|
||||
mergeToBlock: true,
|
||||
toText(block:TextItemBlock) {
|
||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
||||
toText(block:LineItemBlock) {
|
||||
return '```\n' + concatLineItems(block.items) + '```'
|
||||
}
|
||||
},
|
||||
LIST: {
|
||||
mergeToBlock: true,
|
||||
mergeFollowingNonTypedItemsWithSmallDistance: true,
|
||||
toText(block:TextItemBlock) {
|
||||
return concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return concatLineItems(block.items);
|
||||
}
|
||||
},
|
||||
PARAGRAPH: {
|
||||
toText(block:TextItemBlock) {
|
||||
return concatTextItems(block.textItems);
|
||||
toText(block:LineItemBlock) {
|
||||
return concatLineItems(block.items);
|
||||
}
|
||||
}
|
||||
});
|
||||
@ -86,17 +88,17 @@ export function isHeadline(elementType: ElementType) {
|
||||
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
|
||||
}
|
||||
|
||||
export function blockToText(block: TextItemBlock) {
|
||||
export function blockToText(block: LineItemBlock) {
|
||||
if (!block.type) {
|
||||
return concatTextItems(block.textItems);
|
||||
return concatLineItems(block.items);
|
||||
}
|
||||
return block.type.toText(block);
|
||||
}
|
||||
|
||||
function concatTextItems(textItems: TextItem[]) {
|
||||
function concatLineItems(lineItems: LineItem[]) {
|
||||
var text = '';
|
||||
textItems.forEach(item => {
|
||||
text += item.text + '\n';
|
||||
lineItems.forEach(item => {
|
||||
text += item.text() + '\n';
|
||||
});
|
||||
return text;
|
||||
}
|
||||
|
@ -4,24 +4,25 @@ export default class HeadlineFinder {
|
||||
|
||||
constructor(options) {
|
||||
this.headlineCharCodes = normalizedCharCodeArray(options.headline);
|
||||
this.stackedTextItems = [];
|
||||
this.stackedLineItems = [];
|
||||
this.stackedChars = 0;
|
||||
}
|
||||
|
||||
consume(textItem) {
|
||||
const normalizedCharCodes = normalizedCharCodeArray(textItem.text);
|
||||
consume(lineItem) {
|
||||
//TODO avoid join
|
||||
const normalizedCharCodes = normalizedCharCodeArray(lineItem.text());
|
||||
const matchAll = this.matchAll(normalizedCharCodes);
|
||||
if (matchAll) {
|
||||
this.stackedTextItems.push(textItem);
|
||||
this.stackedLineItems.push(lineItem);
|
||||
this.stackedChars += normalizedCharCodes.length;
|
||||
if (this.stackedChars == this.headlineCharCodes.length) {
|
||||
return this.stackedTextItems;
|
||||
return this.stackedLineItems;
|
||||
}
|
||||
} else {
|
||||
if (this.stackedChars > 0) {
|
||||
this.stackedChars = 0;
|
||||
this.stackedTextItems = [];
|
||||
this.consume(textItem); // test again without stack
|
||||
this.stackedLineItems = [];
|
||||
this.consume(lineItem); // test again without stack
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
145
src/javascript/models/LineConverter.jsx
Normal file
145
src/javascript/models/LineConverter.jsx
Normal file
@ -0,0 +1,145 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import Word from './Word.jsx';
|
||||
import WordType from './markdown/WordType.jsx';
|
||||
import LineItem from './LineItem.jsx';
|
||||
import StashingStream from './StashingStream.jsx';
|
||||
import { ParsedElements } from './PageItem.jsx';
|
||||
import { isNumber } from '../functions.jsx'
|
||||
import { sortByX } from '../pageItemFunctions.jsx'
|
||||
|
||||
// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like
|
||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
export default class LineConverter {
|
||||
|
||||
constructor(fontToFormats) {
|
||||
this.fontToFormats = fontToFormats;
|
||||
}
|
||||
|
||||
// returns a CombineResult
|
||||
compact(textItems: TextItem[]) {
|
||||
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||
sortByX(textItems);
|
||||
|
||||
const wordStream = new WordDetectionStream(this.fontToFormats);
|
||||
wordStream.consumeAll(textItems.map(item => new TextItem({
|
||||
...item
|
||||
})));
|
||||
const words = wordStream.complete();
|
||||
|
||||
var maxHeight = 0;
|
||||
var widthSum = 0;
|
||||
textItems.forEach(item => {
|
||||
maxHeight = Math.max(maxHeight, item.height);
|
||||
widthSum += item.width;
|
||||
});
|
||||
return new LineItem({
|
||||
x: textItems[0].x,
|
||||
y: textItems[0].y,
|
||||
height: maxHeight,
|
||||
width: widthSum,
|
||||
words: words,
|
||||
parsedElements: new ParsedElements({
|
||||
footnoteLinks: wordStream.footnoteLinks,
|
||||
footnotes: wordStream.footnotes
|
||||
})
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Joins the text of the given items (via combineText), splits the result at single spaces and
// wraps every non-blank fragment in a Word of the given type.
function itemsToWords(items, format) {
    return combineText(items)
        .split(' ')
        .filter(fragment => fragment.trim().length > 0)
        .map(fragment => new Word({
            string: fragment,
            type: format
        }));
}
|
||||
|
||||
// Concatenates the text of the given items in order.
// A single space is inserted between two neighbours when neither side already provides
// whitespace at the seam and the horizontal gap between them is larger than `minGapForSpace`.
//
// textItems:      objects with `text`, `x` and `width`
// minGapForSpace: gap (in the unit of x/width) which must be exceeded to insert a space, defaults to 5
function combineText(textItems, minGapForSpace = 5) {
    let text = '';
    let lastItem;
    textItems.forEach(textItem => {
        if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) {
            // gap between the right edge of the previous item and the left edge of this one
            const xDistance = textItem.x - lastItem.x - lastItem.width;
            if (xDistance > minGapForSpace) {
                text += ' ';
            }
        }
        text += textItem.text;
        lastItem = textItem;
    });
    return text;
}
|
||||
|
||||
// Stream which stashes consecutive text items sharing the same format and the same
// "is a number" state, and converts every flushed stash into Word objects.
// Stashed numbers are turned into footnote links or footnotes depending on their y position.
// The numbers found are collected in `footnoteLinks` and `footnotes`.
class WordDetectionStream extends StashingStream {

    constructor(fontToFormats) {
        super();
        this.fontToFormats = fontToFormats;
        this.footnoteLinks = [];
        this.footnotes = [];

        this.firstY = undefined; // y of the first consumed item, the baseline of the line
        this.stashedNumber = false; // whether the current stash holds number items
        this.currentItem = undefined; // the item most recently handed to shouldStash()
    }

    // Every item gets stashed; also remembers the line's baseline and the current item
    shouldStash(item) { // eslint-disable-line no-unused-vars
        // compare against undefined: a baseline of y === 0 is valid and must not be overwritten
        if (this.firstY === undefined) {
            this.firstY = item.y;
        }
        this.currentItem = item;
        return true;
    }

    onPushOnStash(item) { // eslint-disable-line no-unused-vars
        this.stashedNumber = isNumber(item.text.trim());
    }

    // Items match the stash if they share the format and the number/non-number state
    doMatchesStash(lastItem, item) {
        const lastItemFormat = this.fontToFormats.get(lastItem.font);
        const itemFormat = this.fontToFormats.get(item.font);
        if (lastItemFormat !== itemFormat) {
            return false;
        }
        const itemIsANumber = isNumber(item.text.trim());
        return this.stashedNumber == itemIsANumber;
    }

    doFlushStash(stash, results) {
        if (!this.stashedNumber) {
            this.copyStashItemsAsText(stash, results);
            return;
        }
        const joinedNumber = stash.map(item => item.text).join('');
        if (stash[0].y > this.firstY) { // footnote link
            results.push(new Word({
                string: `${joinedNumber}`,
                type: WordType.FOOTNOTE_LINK
            //TODO format to
            //^
            //`<sup>[${joinedNumber}](#${joinedNumber})</sup>`
            }));
            this.footnoteLinks.push(parseInt(joinedNumber, 10));
        } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote
            // NOTE(review): currentItem is the item which triggered the flush; on complete() it is
            // the last stashed item itself - confirm that this is the intended comparison
            results.push(new Word({
                string: `${joinedNumber}`,
                type: WordType.FOOTNOTE
            //TODO format to (^${ joinedNumber}):
            }));
            this.footnotes.push(joinedNumber);
        } else {
            this.copyStashItemsAsText(stash, results);
        }
    }

    // Converts the stash to words typed with the format of the first stashed item's font
    copyStashItemsAsText(stash, results) {
        const format = this.fontToFormats.get(stash[0].font);
        results.push(...itemsToWords(stash, format));
    }
}
|
29
src/javascript/models/LineItem.jsx
Normal file
29
src/javascript/models/LineItem.jsx
Normal file
@ -0,0 +1,29 @@
|
||||
import PageItem from './PageItem.jsx'
|
||||
import Word from './Word.jsx'
|
||||
|
||||
// A single line of a page, made up of Word objects
export default class LineItem extends PageItem {

    // options: x, y, width, height and either `words` (Word[]) or a `text` which gets split at spaces
    constructor(options) {
        super(options);
        const {x, y, width, height, words, text} = options;
        this.x = x;
        this.y = y;
        this.width = width;
        this.height = height;
        if (words) {
            this.words = words;
        } else if (text) {
            // no words given: derive untyped words from the plain text
            this.words = text
                .split(" ")
                .filter(fragment => fragment.trim().length > 0)
                .map(fragment => new Word({
                    string: fragment
                }));
        } else {
            this.words = [];
        }
    }

    // The line as a string: all word strings joined by a single space
    text() {
        const strings = this.wordStrings();
        return strings.join(" ");
    }

    // The strings of all words, in order
    wordStrings() {
        return this.words.map(({string}) => string);
    }

}
|
36
src/javascript/models/LineItemBlock.jsx
Normal file
36
src/javascript/models/LineItemBlock.jsx
Normal file
@ -0,0 +1,36 @@
|
||||
import PageItem from './PageItem.jsx'
|
||||
import LineItem from './LineItem.jsx'
|
||||
|
||||
// A block of LineItem[] within a Page
|
||||
export default class LineItemBlock extends PageItem {
|
||||
|
||||
constructor(options) {
|
||||
super(options);
|
||||
this.items = [];
|
||||
if (options.items) {
|
||||
options.items.forEach(item => this.addItem(item));
|
||||
}
|
||||
}
|
||||
|
||||
addItem(item:LineItem) {
|
||||
if (this.type && item.type && this.type !== item.type) {
|
||||
throw `Adding item of type ${item.type} to block of type ${this.type}`
|
||||
}
|
||||
if (!this.type) {
|
||||
this.type = item.type;
|
||||
}
|
||||
if (item.parsedElements) {
|
||||
if (this.parsedElements) {
|
||||
this.parsedElements.add(item.parsedElements);
|
||||
} else {
|
||||
this.parsedElements = item.parsedElements;
|
||||
}
|
||||
}
|
||||
const copiedItem = new LineItem({
|
||||
...item
|
||||
});
|
||||
copiedItem.type = null;
|
||||
this.items.push(copiedItem);
|
||||
}
|
||||
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
// A abstract PageItem class, can be TextItem, or TextItemBlock
|
||||
// An abstract PageItem class, can be TextItem, LineItem or LineItemBlock
|
||||
export default class PageItem {
|
||||
|
||||
constructor(options) {
|
||||
|
73
src/javascript/models/StashingStream.jsx
Normal file
73
src/javascript/models/StashingStream.jsx
Normal file
@ -0,0 +1,73 @@
|
||||
// Abstract stream which allows to stash items temporarily.
// Consumed items are either pushed straight to the results or collected on a stash. The stash is
// flushed (converted into results by the sub-class) as soon as an item arrives which does not
// match the stash, an item arrives which should not be stashed, or the stream is completed.
//
// Sub-classes must implement shouldStash(), doMatchesStash() and doFlushStash().
export default class StashingStream {

    constructor() {
        if (this.constructor === StashingStream) {
            throw new TypeError("Can not construct abstract class.");
        }
        this.results = [];
        this.stash = [];
    }

    // Consumes the given items one after another
    consumeAll(items) {
        items.forEach(item => this.consume(item));
    }

    // Consumes a single item, either stashing it or passing it on to the results
    consume(item) {
        if (this.shouldStash(item)) {
            if (!this.matchesStash(item)) {
                this.flushStash();
            }
            this.pushOnStash(item);
        } else {
            if (this.stash.length > 0) {
                this.flushStash();
            }
            this.results.push(item);
        }
    }

    pushOnStash(item) {
        this.onPushOnStash(item);
        this.stash.push(item);
    }

    // Flushes what is left on the stash and returns all results
    complete() {
        if (this.stash.length > 0) {
            this.flushStash();
        }
        return this.results;
    }

    // return true if the item matches the items of the stash (an empty stash matches everything)
    matchesStash(item) {
        if (this.stash.length === 0) {
            return true;
        }
        const lastItem = this.stash[this.stash.length - 1];
        return this.doMatchesStash(lastItem, item);
    }

    // Lets the sub-class convert the stash into results and starts a new, empty stash
    flushStash() {
        if (this.stash.length > 0) {
            this.doFlushStash(this.stash, this.results);
            this.stash = [];
        }
    }

    onPushOnStash(item) { // eslint-disable-line no-unused-vars
        //sub-classes may override
    }

    // Abstract: return true if the item should be stashed rather than pushed to the results
    shouldStash(item) { // eslint-disable-line no-unused-vars
        throw new TypeError(`Abstract method shouldStash(item) is not implemented by ${this.constructor.name}.`);
    }

    // Abstract: return true if the item belongs to the same group as the last stashed item
    doMatchesStash(lastItem, item) { // eslint-disable-line no-unused-vars
        throw new TypeError(`Abstract method doMatchesStash(lastItem, item) is not implemented by ${this.constructor.name}.`);
    }

    // Abstract: convert the stashed items and push the outcome onto the results
    doFlushStash(stash, results) { // eslint-disable-line no-unused-vars
        throw new TypeError(`Abstract method doFlushStash(stash, results) is not implemented by ${this.constructor.name}.`);
    }
}
|
@ -11,8 +11,6 @@ export default class TextItem extends PageItem {
|
||||
this.height = options.height;
|
||||
this.text = options.text;
|
||||
this.font = options.font;
|
||||
this.fontAscent = options.fontAscent;
|
||||
this.fontDescent = options.fontDescent;
|
||||
|
||||
this.lineFormat = options.lineFormat;
|
||||
this.unopenedFormat = options.unopenedFormat;
|
||||
|
@ -1,36 +0,0 @@
|
||||
import PageItem from './PageItem.jsx'
|
||||
import TextItem from './TextItem.jsx'
|
||||
|
||||
// A block of TextItem[] within a Page
|
||||
export default class TextItemBlock extends PageItem {
|
||||
|
||||
constructor(options) {
|
||||
super(options);
|
||||
this.textItems = [];
|
||||
if (options.textItems) {
|
||||
options.textItems.forEach(item => this.addTextItem(item));
|
||||
}
|
||||
}
|
||||
|
||||
addTextItem(textItem:TextItem) {
|
||||
if (this.type && textItem.type && this.type !== textItem.type) {
|
||||
throw `Adding text item of type ${textItem.type} to block of type ${this.type}`
|
||||
}
|
||||
if (!this.type) {
|
||||
this.type = textItem.type;
|
||||
}
|
||||
if (textItem.parsedElements) {
|
||||
if (this.parsedElements) {
|
||||
this.parsedElements.add(textItem.parsedElements);
|
||||
} else {
|
||||
this.parsedElements = textItem.parsedElements;
|
||||
}
|
||||
}
|
||||
const copiedTextItem = new TextItem({
|
||||
...textItem
|
||||
});
|
||||
copiedTextItem.type = null;
|
||||
this.textItems.push(copiedTextItem);
|
||||
}
|
||||
|
||||
}
|
@ -1,227 +0,0 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { ParsedElements } from './PageItem.jsx';
|
||||
import { isNumber } from '../functions.jsx'
|
||||
import { sortByX } from '../textItemFunctions.jsx'
|
||||
import { prefixAfterWhitespace, suffixBeforeWhitespace } from '../functions.jsx';
|
||||
|
||||
// Compact text items which have been grouped to a line (through TextItemLineCompactor) to a single TextItem doing inline transformations like
|
||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
export default class TextItemLineCompactor {
|
||||
|
||||
constructor(fontToFormats) {
|
||||
this.fontToFormats = fontToFormats;
|
||||
}
|
||||
|
||||
// returns a CombineResult
|
||||
compact(lineItems: TextItem[]) {
|
||||
if (lineItems.length < 2) {
|
||||
throw "Must be at least 2 line items, but was " + lineItems;
|
||||
}
|
||||
|
||||
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||
sortByX(lineItems);
|
||||
|
||||
const formatter = new Formatter(this.fontToFormats);
|
||||
var [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
|
||||
resolvedLineItems.forEach(item => formatter.consume(item));
|
||||
resolvedLineItems = formatter.getResults();
|
||||
parsedElements.inlineFormats = formatter.inlineFormats;
|
||||
// const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
|
||||
|
||||
var combinedItem;
|
||||
if (resolvedLineItems.length == 1) {
|
||||
combinedItem = resolvedLineItems[0];
|
||||
} else {
|
||||
var text = '';
|
||||
var maxHeight = 0;
|
||||
var widthSum = 0;
|
||||
var lastItem;
|
||||
resolvedLineItems.forEach(item => {
|
||||
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
||||
const xDistance = item.x - lastItem.x - lastItem.width;
|
||||
if (xDistance >= 5) {
|
||||
text += ' ';
|
||||
}
|
||||
}
|
||||
text += item.text;
|
||||
widthSum += item.width;
|
||||
lastItem = item;
|
||||
maxHeight = Math.max(maxHeight, item.height);
|
||||
});
|
||||
combinedItem = new TextItem({
|
||||
...resolvedLineItems[0],
|
||||
text: text,
|
||||
height: maxHeight,
|
||||
width: widthSum
|
||||
});
|
||||
}
|
||||
combinedItem.parsedElements = parsedElements;
|
||||
combinedItem.lineFormat = formatter.lineFormat;
|
||||
combinedItem.unopenedFormat = formatter.unopenedFormat;
|
||||
combinedItem.unclosedFormat = formatter.unclosedFormat;
|
||||
return combinedItem;
|
||||
}
|
||||
|
||||
|
||||
resolveSpecialElements(lineItems) {
|
||||
const footnoteLinks = [];
|
||||
const footnotes = [];
|
||||
const basicY = lineItems[0].y;
|
||||
const newLineItems = [];
|
||||
var stashedNumberItems = [];
|
||||
|
||||
const commitStashedNumbers = (nextItem) => {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
||||
if (stashedNumberItems[0].y > basicY) { // footnote link
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
//TODO make fomatting configurable
|
||||
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
||||
text: `^${joinedNumber}`
|
||||
}));
|
||||
footnoteLinks.push(parseInt(joinedNumber));
|
||||
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
||||
//TODO womb comp [29] => ydiff == 0
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
text: `(^${ joinedNumber}): `
|
||||
}));
|
||||
footnotes.push(joinedNumber);
|
||||
} else {
|
||||
stashedNumberItems.forEach(number => newLineItems.push(number));
|
||||
}
|
||||
|
||||
stashedNumberItems = [];
|
||||
}
|
||||
};
|
||||
|
||||
lineItems.forEach(item => {
|
||||
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
||||
// skip whitespace on the beginning of a line
|
||||
} else {
|
||||
const isANumber = isNumber(item.text.trim());
|
||||
if (isANumber) {
|
||||
stashedNumberItems.push(item);
|
||||
} else {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
commitStashedNumbers(item);
|
||||
}
|
||||
newLineItems.push(item);
|
||||
}
|
||||
}
|
||||
});
|
||||
commitStashedNumbers();
|
||||
|
||||
|
||||
return [newLineItems, new ParsedElements({
|
||||
footnoteLinks: footnoteLinks,
|
||||
footnotes: footnotes
|
||||
})];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class Formatter {
|
||||
|
||||
constructor(fontToFormats) {
|
||||
this.fontToFormats = fontToFormats;
|
||||
|
||||
this.resultItems = [];
|
||||
this.lineFormat;
|
||||
this.unopenedFormat;
|
||||
this.unclosedFormat;
|
||||
|
||||
this.openFormat;
|
||||
this.stashedItems = [];
|
||||
this.inlineFormats = 0;
|
||||
this.lastItem;
|
||||
}
|
||||
|
||||
|
||||
consume(item) {
|
||||
const formatType = this.fontToFormats.get(item.font);
|
||||
if (this.openFormat && formatType !== this.openFormat) {
|
||||
this.flushStash(false);
|
||||
}
|
||||
if (formatType.needFormat) {
|
||||
this.openFormat = formatType;
|
||||
this.stashedItems.push(item);
|
||||
} else {
|
||||
this.resultItems.push(item);
|
||||
}
|
||||
}
|
||||
|
||||
getResults() {
|
||||
if (this.openFormat) {
|
||||
this.flushStash(true);
|
||||
}
|
||||
return this.resultItems;
|
||||
}
|
||||
|
||||
flushStash(formatToEndOfLine) {
|
||||
const formatFromBeginningOfLine = this.resultItems == 0;
|
||||
if (formatFromBeginningOfLine) {
|
||||
if (formatToEndOfLine) {
|
||||
this.lineFormat = this.openFormat;
|
||||
this.moveStashItemsToResult();
|
||||
} else {
|
||||
this.unopenedFormat = this.openFormat;
|
||||
const newLastItem = this.newClosingItem(this.stashedItems.pop());
|
||||
this.moveStashItemsToResult();
|
||||
this.resultItems.push(newLastItem);
|
||||
}
|
||||
} else {
|
||||
if (formatToEndOfLine) {
|
||||
this.unclosedFormat = this.openFormat;
|
||||
const newFirstItem = this.newOpeningItem(this.stashedItems.shift());
|
||||
this.resultItems.push(newFirstItem);
|
||||
this.moveStashItemsToResult();
|
||||
} else {
|
||||
this.inlineFormats++;
|
||||
if (this.stashedItems.length == 1) {
|
||||
const onlyItem = this.stashedItems.pop();
|
||||
if (onlyItem.text.trim().length > 0) {
|
||||
const onlyItemFormatted = this.newCompleteItem(onlyItem);
|
||||
this.resultItems.push(onlyItemFormatted);
|
||||
}
|
||||
this.moveStashItemsToResult();
|
||||
} else {
|
||||
const firstItem = this.newOpeningItem(this.stashedItems.shift());
|
||||
const lastItem = this.newClosingItem(this.stashedItems.pop());
|
||||
this.resultItems.push(firstItem);
|
||||
this.moveStashItemsToResult();
|
||||
this.resultItems.push(lastItem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
moveStashItemsToResult() {
|
||||
this.resultItems.push(...this.stashedItems);
|
||||
this.stashedItems = [];
|
||||
this.openFormat = null;
|
||||
}
|
||||
|
||||
newOpeningItem(item) {
|
||||
return new TextItem({
|
||||
...item,
|
||||
text: prefixAfterWhitespace(this.openFormat.startSymbol, item.text)
|
||||
});
|
||||
}
|
||||
|
||||
newClosingItem(item) {
|
||||
return new TextItem({
|
||||
...item,
|
||||
text: suffixBeforeWhitespace(item.text, this.openFormat.endSymbol)
|
||||
});
|
||||
}
|
||||
|
||||
newCompleteItem(item) {
|
||||
return new TextItem({
|
||||
...item,
|
||||
text: suffixBeforeWhitespace(prefixAfterWhitespace(this.openFormat.startSymbol, item.text), this.openFormat.endSymbol)
|
||||
});
|
||||
}
|
||||
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { sortByX } from '../textItemFunctions.jsx'
|
||||
import { sortByX } from '../pageItemFunctions.jsx'
|
||||
|
||||
//Groups all text items which are on the same y line
|
||||
export default class TextItemLineGrouper {
|
||||
|
8
src/javascript/models/Word.jsx
Normal file
8
src/javascript/models/Word.jsx
Normal file
@ -0,0 +1,8 @@
|
||||
// A single word of a line: its string plus an optional WordType
export default class Word {

    // options.string: the text of the word
    // options.type:   the WordType of the word, may be left out
    constructor(options) {
        const {string, type} = options;
        this.string = string;
        this.type = type; // WordType
    }

}
|
7
src/javascript/models/markdown/WordType.jsx
Normal file
7
src/javascript/models/markdown/WordType.jsx
Normal file
@ -0,0 +1,7 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
// A Markdown word element: the type which can be assigned to a single Word.
// NOTE(review): the members presumably correspond to inline markdown formats (link, footnote, bold, ...) - confirm against the code which renders words
|
||||
export default class WordType extends Enum {
|
||||
}
|
||||
|
||||
// Registers the enum members on WordType (accessed e.g. as WordType.FOOTNOTE_LINK)
WordType.initEnum(['LINK', 'FOOTNOTE_LINK', 'FOOTNOTE', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']);
|
@ -1,16 +1,16 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx';
|
||||
import LineItemBlock from '../LineItemBlock.jsx';
|
||||
import LineItemBlockPageView from '../../components/debug/LineItemBlockPageView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView
|
||||
export default class ToTextItemBlockTransformation extends Transformation {
|
||||
// Abstract class for transformations producing LineItemBlock(s) to be shown in the LineItemBlockPageView
|
||||
export default class ToLineItemBlockTransformation extends Transformation {
|
||||
|
||||
constructor(name) {
|
||||
super(name, TextItemBlock.name);
|
||||
if (this.constructor === ToTextItemBlockTransformation) {
|
||||
super(name, LineItemBlock.name);
|
||||
if (this.constructor === ToLineItemBlockTransformation) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
this.showWhitespaces = false;
|
||||
@ -25,7 +25,7 @@ export default class ToTextItemBlockTransformation extends Transformation {
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) {
|
||||
return <TextItemBlockPageView
|
||||
return <LineItemBlockPageView
|
||||
key={ page.index }
|
||||
page={ page }
|
||||
modificationsOnly={ modificationsOnly }
|
@ -0,0 +1,46 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import LineItem from '../LineItem.jsx';
|
||||
import LineItemPageView from '../../components/debug/LineItemPageView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Abstract class for transformations producing LineItem(s) to be shown in the LineItemPageView
|
||||
export default class ToLineItemTransformation extends Transformation {
|
||||
|
||||
constructor(name) {
|
||||
super(name, LineItem.name);
|
||||
if (this.constructor === ToLineItemTransformation) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
this.showWhitespaces = false;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) {
|
||||
return <LineItemPageView
|
||||
key={ page.index }
|
||||
page={ page }
|
||||
modificationsOnly={ modificationsOnly }
|
||||
showWhitespaces={ this.showWhitespaces } />;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
// The usual cleanup
|
||||
parseResult.messages = [];
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
|
||||
page.items.forEach(item => item.annotation = null);
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -1,6 +1,7 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import StringFormat from '../../StringFormat.jsx';
|
||||
import WordType from '../../markdown/WordType.jsx';
|
||||
// import StringFormat from '../../StringFormat.jsx';
|
||||
|
||||
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||
|
||||
@ -54,21 +55,21 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||
this.fontMap.forEach(function(value, key) {
|
||||
fontIdToName.push(key + " = " + value.name)
|
||||
const fontName = value.name.toLowerCase();
|
||||
var format;
|
||||
var type;
|
||||
if (key == mostUsedFont) {
|
||||
format = StringFormat.STANDARD;
|
||||
type = null;
|
||||
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
|
||||
format = StringFormat.BOLD_OBLIQUE;
|
||||
type = WordType.BOLD_OBLIQUE;
|
||||
} else if (fontName.includes('bold')) {
|
||||
format = StringFormat.BOLD;
|
||||
type = WordType.BOLD;
|
||||
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
|
||||
format = StringFormat.OBLIQUE;
|
||||
type = WordType.OBLIQUE;
|
||||
} else if (fontName === maxHeightFont) {
|
||||
format = StringFormat.BOLD;
|
||||
} else {
|
||||
format = StringFormat.STANDARD;
|
||||
type = WordType.BOLD;
|
||||
}
|
||||
if (type) {
|
||||
fontToFormats.set(key, type);
|
||||
}
|
||||
fontToFormats.set(key, format);
|
||||
});
|
||||
fontIdToName.sort();
|
||||
|
||||
|
@ -1,16 +1,16 @@
|
||||
import React from 'react';
|
||||
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import { ParsedElements } from '../../PageItem.jsx';
|
||||
import LineItem from '../../LineItem.jsx';
|
||||
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
|
||||
import TextItemLineCompactor from '../../TextItemLineCompactor.jsx';
|
||||
import LineConverter from '../../LineConverter.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
|
||||
|
||||
// gathers text items on the same y line to one text item
|
||||
export default class CompactLines extends ToTextItemTransformation {
|
||||
// gathers text items on the same y line to one line item
|
||||
export default class CompactLines extends ToLineItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Compact To Lines");
|
||||
@ -20,39 +20,32 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
const {mostUsedDistance, fontToFormats} = parseResult.globals;
|
||||
const foundFootnotes = [];
|
||||
const foundFootnoteLinks = [];
|
||||
var inlineFormats = 0;
|
||||
var lineFormats = 0;
|
||||
var unopenedFormats = 0;
|
||||
var unclosedFormats = 0;
|
||||
var formattedWords = 0;
|
||||
|
||||
const lineGrouper = new TextItemLineGrouper({
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
});
|
||||
const lineCompactor = new TextItemLineCompactor(fontToFormats);
|
||||
const lineCompactor = new LineConverter(fontToFormats);
|
||||
|
||||
parseResult.pages.forEach(page => {
|
||||
if (page.items.length > 0) {
|
||||
const newItems = [];
|
||||
const lineItems = [];
|
||||
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
||||
textItemsGroupedByLine.forEach(textItemsOfLine => {
|
||||
var lineItem;
|
||||
if (textItemsOfLine.length == 1) {
|
||||
lineItem = textItemsOfLine[0];
|
||||
const formatType = fontToFormats.get(lineItem.font);
|
||||
if (formatType.needFormat) {
|
||||
lineItem.lineFormat = formatType;
|
||||
lineItem.parsedElements = new ParsedElements({
|
||||
completeLineFormats: 1
|
||||
textItemsGroupedByLine.forEach(lineTextItems => {
|
||||
const lineItem = lineCompactor.compact(lineTextItems);
|
||||
if (lineTextItems.length > 1) {
|
||||
lineItem.annotation = ADDED_ANNOTATION;
|
||||
lineTextItems.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
lineItems.push(new LineItem({
|
||||
...item
|
||||
}));
|
||||
});
|
||||
}
|
||||
} else {
|
||||
textItemsOfLine.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newItems.push(item);
|
||||
});
|
||||
|
||||
lineItem = lineCompactor.compact(textItemsOfLine);
|
||||
lineItem.annotation = ADDED_ANNOTATION;
|
||||
if (lineItem.words.length == 0) {
|
||||
lineItem.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
lineItems.push(lineItem);
|
||||
|
||||
if (lineItem.parsedElements.footnoteLinks.length > 0) {
|
||||
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
||||
@ -63,15 +56,8 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||
}
|
||||
inlineFormats += lineItem.parsedElements.inlineFormats;
|
||||
}
|
||||
if (lineItem.lineFormat) lineFormats++;
|
||||
if (lineItem.unopenedFormat) unopenedFormats++;
|
||||
if (lineItem.unclosedFormat) unclosedFormats++;
|
||||
lineItem.text = lineItem.text.trim();
|
||||
newItems.push(lineItem);
|
||||
});
|
||||
page.items = newItems;
|
||||
page.items = lineItems;
|
||||
}
|
||||
});
|
||||
|
||||
@ -79,11 +65,8 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Detected ' + lineFormats + ' line formats',
|
||||
'Detected ' + inlineFormats + ' inline formats',
|
||||
'Detected ' + unclosedFormats + ' opened un-closed formats',
|
||||
'Detected ' + unopenedFormats + ' un-opened closed formats',
|
||||
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
|
||||
'Detected ' + formattedWords + ' formatted words',
|
||||
<span>Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }]</span>,
|
||||
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
|
||||
]
|
||||
});
|
||||
|
@ -6,6 +6,8 @@ import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../.
|
||||
//Complete unopened/unclosed bold/italic formats
|
||||
export default class CompleteFormats extends ToTextItemTransformation {
|
||||
|
||||
//TODO move to block and ignore quotes
|
||||
|
||||
constructor() {
|
||||
super("Complete Bold/Italics");
|
||||
}
|
||||
@ -81,7 +83,6 @@ class ItemStack {
|
||||
}
|
||||
|
||||
consume(item) {
|
||||
const te = item.text;
|
||||
var newItem;
|
||||
|
||||
const handleFreshUnopened = () => {
|
||||
|
@ -1,4 +1,4 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
@ -6,7 +6,7 @@ import { headlineByLevel } from '../../ElementType.jsx';
|
||||
import { isListItem } from '../../../functions.jsx';
|
||||
|
||||
//Detect headlines based on item heights, fonts and upper-case text
|
||||
export default class DetectHeaders extends ToTextItemTransformation {
|
||||
export default class DetectHeaders extends ToLineItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Headers");
|
||||
@ -21,15 +21,15 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
||||
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
||||
pagesWithMaxHeight.forEach(titlePage => {
|
||||
titlePage.items.forEach(textItem => {
|
||||
const height = textItem.height;
|
||||
if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||
titlePage.items.forEach(item => {
|
||||
const height = item.height;
|
||||
if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||
if (height == maxHeight) {
|
||||
textItem.type = ElementType.H1;
|
||||
item.type = ElementType.H1;
|
||||
} else {
|
||||
textItem.type = ElementType.H2;
|
||||
item.type = ElementType.H2;
|
||||
}
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
detectedHeaders++;
|
||||
}
|
||||
});
|
||||
@ -41,10 +41,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
var range = headlineTypeToHeightRange[headlineType];
|
||||
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == range.max) {
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.enumValueOf(headlineType);
|
||||
page.items.forEach(item => {
|
||||
if (!item.type && item.height == range.max) {
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = ElementType.enumValueOf(headlineType);
|
||||
detectedHeaders++
|
||||
}
|
||||
});
|
||||
@ -56,10 +56,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
const heights = [];
|
||||
var lastHeight;
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) {
|
||||
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
|
||||
heights.push(textItem.height);
|
||||
page.items.forEach(item => {
|
||||
if (!item.type && item.height > mostUsedHeight && !isListItem(item.text())) {
|
||||
if (!heights.includes(item.height) && (!lastHeight || lastHeight > item.height)) {
|
||||
heights.push(item.height);
|
||||
}
|
||||
}
|
||||
});
|
||||
@ -69,11 +69,11 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
heights.forEach((height, i) => {
|
||||
const headlineType = headlineByLevel(2 + i);
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) {
|
||||
page.items.forEach(item => {
|
||||
if (!item.type && item.height == height && !isListItem(item.text())) {
|
||||
detectedHeaders++;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = headlineType;
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = headlineType;
|
||||
}
|
||||
});
|
||||
});
|
||||
@ -83,9 +83,9 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
//find headlines which have paragraph height
|
||||
var smallesHeadlineLevel = 1;
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (textItem.type && textItem.type.headline) {
|
||||
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel);
|
||||
page.items.forEach(item => {
|
||||
if (item.type && item.type.headline) {
|
||||
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, item.type.headlineLevel);
|
||||
}
|
||||
});
|
||||
});
|
||||
@ -93,18 +93,18 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
|
||||
parseResult.pages.forEach(page => {
|
||||
var lastItem;
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type
|
||||
&& textItem.height == mostUsedHeight
|
||||
&& textItem.font !== mostUsedFont
|
||||
&& (!lastItem || lastItem.y < textItem.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - textItem.y > mostUsedDistance * 2))
|
||||
&& textItem.text === textItem.text.toUpperCase()
|
||||
page.items.forEach(item => {
|
||||
if (!item.type
|
||||
&& item.height == mostUsedHeight
|
||||
&& item.font !== mostUsedFont
|
||||
&& (!lastItem || lastItem.y < item.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - item.y > mostUsedDistance * 2))
|
||||
&& item.text() === item.text().toUpperCase()
|
||||
) {
|
||||
detectedHeaders++;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = nextHeadlineType;
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = nextHeadlineType;
|
||||
}
|
||||
lastItem = textItem;
|
||||
lastItem = item;
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -124,8 +124,8 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
function findPagesWithMaxHeight(pages, maxHeight) {
|
||||
const maxHeaderPagesSet = new Set();
|
||||
pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == maxHeight) {
|
||||
page.items.forEach(item => {
|
||||
if (!item.type && item.height == maxHeight) {
|
||||
maxHeaderPagesSet.add(page);
|
||||
}
|
||||
});
|
||||
|
@ -1,12 +1,12 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import LineItem from '../../LineItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
||||
|
||||
//Detect items starting with -, •, etc...
|
||||
export default class DetectListItems extends ToTextItemTransformation {
|
||||
export default class DetectListItems extends ToLineItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect List Items");
|
||||
@ -16,34 +16,34 @@ export default class DetectListItems extends ToTextItemTransformation {
|
||||
var foundListItems = 0;
|
||||
var foundNumberedItems = 0;
|
||||
parseResult.pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.items.forEach(textItem => {
|
||||
newTextItems.push(textItem);
|
||||
if (!textItem.type) {
|
||||
var text = textItem.text;
|
||||
const newItems = [];
|
||||
page.items.forEach(item => {
|
||||
newItems.push(item);
|
||||
if (!item.type) {
|
||||
var text = item.text();
|
||||
if (isListItem(text)) {
|
||||
foundListItems++
|
||||
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
||||
if (textWithDash === text) {
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.LIST;
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = ElementType.LIST;
|
||||
} else {
|
||||
textItem.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(new TextItem({
|
||||
...textItem,
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newItems.push(new LineItem({
|
||||
...item,
|
||||
text: textWithDash,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
type: ElementType.LIST
|
||||
}));
|
||||
}
|
||||
} else if (isNumberedListItem(text)) {
|
||||
} else if (isNumberedListItem(text)) { //TODO check that starts with 1 (kala chakra)
|
||||
foundNumberedItems++;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.LIST;
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = ElementType.LIST;
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newTextItems;
|
||||
page.items = newItems;
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
|
@ -1,14 +1,15 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import LineItem from '../../LineItem.jsx';
|
||||
import Word from '../../Word.jsx';
|
||||
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { headlineByLevel } from '../../ElementType.jsx';
|
||||
import { isDigit, wordMatch } from '../../../functions.jsx'
|
||||
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx'
|
||||
|
||||
//Detect table of contents pages
|
||||
export default class DetectTOC extends ToTextItemTransformation {
|
||||
//Detect table of contents pages plus linked headlines
|
||||
export default class DetectTOC extends ToLineItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect TOC");
|
||||
@ -17,64 +18,68 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
transform(parseResult:ParseResult) {
|
||||
const tocPages = [];
|
||||
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
||||
|
||||
const linkLeveler = new LinkLeveler();
|
||||
|
||||
|
||||
var tocLinks = [];
|
||||
var lastTocPage;
|
||||
var headlineItem;
|
||||
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
||||
const lineItemsWithDigits = [];
|
||||
var lineItemsWithDigits = 0;
|
||||
const unknownLines = new Set();
|
||||
const pageTocLinks = [];
|
||||
var lastLineTextWithoutNumber;
|
||||
var lastWordsWithoutNumber;
|
||||
var lastLine;
|
||||
//find lines ending with a number per page
|
||||
page.items.forEach(line => {
|
||||
var lineText = line.text.replace(/\./g, '').trim();
|
||||
var endsWithDigit = false;
|
||||
var digits = [];
|
||||
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
|
||||
digits.unshift(lineText.charAt(lineText.length - 1));
|
||||
lineText = lineText.substring(0, lineText.length - 1);
|
||||
endsWithDigit = true;
|
||||
var words = line.words.filter(word => !hasOnly(word.string, '.'));
|
||||
const digits = [];
|
||||
while (words.length > 0 && isNumber(words[words.length - 1].string)) {
|
||||
const lastWord = words.pop();
|
||||
digits.unshift(lastWord.string);
|
||||
}
|
||||
lineText = lineText.trim();
|
||||
|
||||
if (digits.length == 0 && words.length > 0) {
|
||||
const lastWord = words[words.length - 1];
|
||||
while (isDigit(lastWord.string.charCodeAt(lastWord.string.length - 1))) {
|
||||
digits.unshift(lastWord.string.charAt(lastWord.string.length - 1))
|
||||
lastWord.string = lastWord.string.substring(0, lastWord.string.length - 1);
|
||||
}
|
||||
}
|
||||
var endsWithDigit = digits.length > 0;
|
||||
if (endsWithDigit) {
|
||||
endsWithDigit = true;
|
||||
if (lastLineTextWithoutNumber) { // 2-line item ?
|
||||
lineText = lastLineTextWithoutNumber + ' ' + lineText;
|
||||
lastLineTextWithoutNumber = null;
|
||||
if (lastWordsWithoutNumber) { // 2-line item ?
|
||||
words.push(...lastWordsWithoutNumber);
|
||||
lastWordsWithoutNumber = null;
|
||||
}
|
||||
pageTocLinks.push(new TocLink({
|
||||
pageNumber: parseInt(digits.join('')),
|
||||
textItem: new TextItem({
|
||||
lineItem: new LineItem({
|
||||
...line,
|
||||
text: lineText
|
||||
words: words
|
||||
})
|
||||
}));
|
||||
lineItemsWithDigits.push(new TextItem({
|
||||
...line,
|
||||
text: lineText
|
||||
}));
|
||||
lastLineTextWithoutNumber = null;
|
||||
lineItemsWithDigits++;
|
||||
} else {
|
||||
if (!headlineItem) {
|
||||
headlineItem = line;
|
||||
} else {
|
||||
if (lastLineTextWithoutNumber) {
|
||||
if (lastWordsWithoutNumber) {
|
||||
unknownLines.add(lastLine);
|
||||
}
|
||||
lastLineTextWithoutNumber = lineText;
|
||||
lastWordsWithoutNumber = words;
|
||||
lastLine = line;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// page has been processed
|
||||
if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
|
||||
if (lineItemsWithDigits * 100 / page.items.length > 75) {
|
||||
tocPages.push(page.index + 1);
|
||||
lastTocPage = page;
|
||||
linkLeveler.levelPageItems(pageTocLinks);
|
||||
tocLinks = tocLinks.concat(pageTocLinks);
|
||||
tocLinks.push(...pageTocLinks);
|
||||
|
||||
const newBlocks = [];
|
||||
page.items.forEach((line) => {
|
||||
@ -83,7 +88,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
}
|
||||
newBlocks.push(line);
|
||||
if (line === headlineItem) {
|
||||
newBlocks.push(new TextItem({
|
||||
newBlocks.push(new LineItem({
|
||||
...line,
|
||||
type: ElementType.H2,
|
||||
annotation: ADDED_ANNOTATION
|
||||
@ -105,8 +110,10 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
if (tocPages.length > 0) {
|
||||
// Add TOC items
|
||||
tocLinks.forEach(tocLink => {
|
||||
lastTocPage.items.push(new TextItem({
|
||||
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
||||
lastTocPage.items.push(new LineItem({
|
||||
words: [new Word({
|
||||
string: ' '.repeat(tocLink.level * 3) + '-'
|
||||
})].concat(tocLink.lineItem.words),
|
||||
type: ElementType.TOC,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
@ -118,11 +125,11 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
|
||||
var foundHealineItems;
|
||||
if (linkedPage) {
|
||||
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
|
||||
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
|
||||
if (!foundHealineItems) { // pages are off by 1 ?
|
||||
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
|
||||
if (linkedPage) {
|
||||
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
|
||||
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -142,11 +149,16 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
||||
const heightRange = headlineTypeToHeightRange[headlineType.name];
|
||||
if (heightRange) {
|
||||
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
||||
if (textItem) {
|
||||
textItem.type = headlineType;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
foundBySize.push(textItem.text);
|
||||
const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
||||
if (lineIndex > -1) {
|
||||
const page = parseResult.pages[pageIndex];
|
||||
page.items[lineIndex].annotation = REMOVED_ANNOTATION;
|
||||
page.items.splice(lineIndex + 1, 0, new LineItem({
|
||||
...notFoundTocLink.lineItem,
|
||||
type: headlineType,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
}));
|
||||
foundBySize.push(notFoundTocLink);
|
||||
}
|
||||
}
|
||||
});
|
||||
@ -173,12 +185,12 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
const messages = [];
|
||||
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
||||
if (tocPages.length > 0) {
|
||||
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
|
||||
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
|
||||
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
|
||||
}
|
||||
if (notFoundHeadlines.length > 0) {
|
||||
messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
|
||||
messages.push('Found TOC headlines (by size): ' + foundBySize);
|
||||
messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text()));
|
||||
messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber));
|
||||
}
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
@ -196,7 +208,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
//Find out how the TOC page link actually translates to the page.index
|
||||
function detectPageMappingNumber(pages, tocLinks) {
|
||||
for ( var tocLink of tocLinks ) {
|
||||
const page = findPageWithHeadline(pages, tocLink.textItem.text);
|
||||
const page = findPageWithHeadline(pages, tocLink.lineItem.text());
|
||||
if (page) {
|
||||
return page.index - tocLink.pageNumber;
|
||||
}
|
||||
@ -235,9 +247,9 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
||||
foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
||||
const headlineType = headlineByLevel(tocLink.level + 2);
|
||||
const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
|
||||
page.items.splice(foundItems.lineIndex + 1, 0, new TextItem({
|
||||
page.items.splice(foundItems.lineIndex + 1, 0, new LineItem({
|
||||
...foundItems.headlineItems[0],
|
||||
text: tocLink.textItem.text,
|
||||
words: tocLink.lineItem.words,
|
||||
height: headlineHeight,
|
||||
type: headlineType,
|
||||
annotation: ADDED_ANNOTATION
|
||||
@ -255,20 +267,21 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
||||
}
|
||||
}
|
||||
|
||||
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
|
||||
function findPageAndLineFromHeadline(pages, tocLink, heightRange, fromPage, toPage) {
|
||||
const linkText = tocLink.lineItem.text().toUpperCase();
|
||||
for (var i = fromPage; i <= toPage; i++) {
|
||||
const page = pages[i - 1];
|
||||
for ( var line of page.items ) {
|
||||
const lineIndex = page.items.findIndex(line => {
|
||||
if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
|
||||
const match = wordMatch(tocLink.textItem.text, line.text);
|
||||
if (match >= 0.5) {
|
||||
return line;
|
||||
const match = wordMatch(linkText, line.text());
|
||||
return match >= 0.5;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
if (lineIndex > -1) return [i - 1, lineIndex];
|
||||
}
|
||||
return [-1, -1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class LinkLeveler {
|
||||
constructor() {
|
||||
@ -297,13 +310,13 @@ class LinkLeveler {
|
||||
levelByXDiff(tocLinks) {
|
||||
const uniqueX = this.calculateUniqueX(tocLinks);
|
||||
tocLinks.forEach(link => {
|
||||
link.level = uniqueX.indexOf(link.textItem.x);
|
||||
link.level = uniqueX.indexOf(link.lineItem.x);
|
||||
});
|
||||
}
|
||||
|
||||
levelByFont(tocLinks) {
|
||||
tocLinks.forEach(link => {
|
||||
link.level = this.uniqueFonts.indexOf(link.textItem.font);
|
||||
link.level = this.uniqueFonts.indexOf(link.lineItem.font);
|
||||
});
|
||||
}
|
||||
|
||||
@ -315,7 +328,7 @@ class LinkLeveler {
|
||||
|
||||
calculateUniqueX(tocLinks) {
|
||||
var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
|
||||
if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
|
||||
if (uniquesArray.indexOf(link.lineItem.x) < 0) uniquesArray.push(link.lineItem.x);
|
||||
return uniquesArray;
|
||||
}, []);
|
||||
|
||||
@ -328,7 +341,7 @@ class LinkLeveler {
|
||||
|
||||
calculateUniqueFonts(tocLinks) {
|
||||
var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
|
||||
if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
|
||||
if (uniquesArray.indexOf(link.lineItem.font) < 0) uniquesArray.push(link.lineItem.font);
|
||||
return uniquesArray;
|
||||
}, []);
|
||||
|
||||
@ -339,7 +352,7 @@ class LinkLeveler {
|
||||
|
||||
class TocLink {
|
||||
constructor(options) {
|
||||
this.textItem = options.textItem;
|
||||
this.lineItem = options.lineItem;
|
||||
this.pageNumber = options.pageNumber;
|
||||
this.level = 0;
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
|
||||
|
||||
@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) {
|
||||
|
||||
|
||||
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
||||
export default class RemoveRepetitiveElements extends ToTextItemTransformation {
|
||||
export default class RemoveRepetitiveElements extends ToLineItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Repetitive Elements");
|
||||
@ -58,8 +58,8 @@ export default class RemoveRepetitiveElements extends ToTextItemTransformation {
|
||||
maxElements: []
|
||||
});
|
||||
|
||||
const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
|
||||
const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
|
||||
const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
|
||||
const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
|
||||
pageStore.push({
|
||||
minElements: minMaxItems.minElements,
|
||||
maxElements: minMaxItems.maxElements,
|
||||
|
@ -1,10 +1,11 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import LineItem from '../../LineItem.jsx';
|
||||
import StashingStream from '../../StashingStream.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
|
||||
// Converts vertical text to horizontal
|
||||
export default class VerticalToHorizontal extends ToTextItemTransformation {
|
||||
export default class VerticalToHorizontal extends ToLineItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Vertical to Horizontal Text");
|
||||
@ -12,87 +13,64 @@ export default class VerticalToHorizontal extends ToTextItemTransformation {
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var foundVerticals = 0;
|
||||
const newPages = parseResult.pages.map(page => {
|
||||
const newTextItems = [];
|
||||
// var oneCharacterItems = [];
|
||||
|
||||
// const applyTransformation = () => {
|
||||
// oneCharacterItems.forEach(item => {
|
||||
// item.annotation = REMOVED_ANNOTATION;
|
||||
// newTextItems.push(item);
|
||||
// //TODO add new
|
||||
// });
|
||||
// oneCharacterItems = [];
|
||||
// };
|
||||
// const rollbackTransformation = () => {
|
||||
// oneCharacterItems.forEach(item => {
|
||||
// newTextItems.push(item);
|
||||
// });
|
||||
// oneCharacterItems = [];
|
||||
// };
|
||||
|
||||
//TODO generic state machine code ?
|
||||
|
||||
const leftOver = page.items.reduce((oneCharacterItems, item) => {
|
||||
if (item.text.trim().length == 1) {
|
||||
if (oneCharacterItems.length == 0) {
|
||||
oneCharacterItems.push(item);
|
||||
} else {
|
||||
const lastItem = oneCharacterItems[oneCharacterItems.length - 1];
|
||||
if (lastItem.y - item.y > 5 && lastItem.font === item.font) {
|
||||
oneCharacterItems.push(item);
|
||||
} else {
|
||||
if (oneCharacterItems.length > 5) {
|
||||
var combinedText = '';
|
||||
var minX = 999;
|
||||
var maxY = 0;
|
||||
var sumWidth = 0;
|
||||
var maxHeight = 0;
|
||||
oneCharacterItems.forEach(oneCharacterItem => {
|
||||
oneCharacterItem.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(oneCharacterItem);
|
||||
combinedText += oneCharacterItem.text.trim();
|
||||
minX = Math.min(minX, oneCharacterItem.x);
|
||||
maxY = Math.max(maxY, oneCharacterItem.y);
|
||||
sumWidth += oneCharacterItem.width;
|
||||
maxHeight = Math.max(maxHeight, oneCharacterItem.height);
|
||||
parseResult.pages.forEach(page => {
|
||||
const stream = new VerticalsStream();
|
||||
stream.consumeAll(page.items);
|
||||
page.items = stream.complete();
|
||||
foundVerticals += stream.foundVerticals;
|
||||
});
|
||||
newTextItems.push(new TextItem({
|
||||
...oneCharacterItems[0],
|
||||
x: minX,
|
||||
y: maxY,
|
||||
width: sumWidth,
|
||||
height: maxHeight,
|
||||
text: combinedText,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
foundVerticals++;
|
||||
} else {
|
||||
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||
}
|
||||
oneCharacterItems = [item];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||
oneCharacterItems = [];
|
||||
newTextItems.push(item);
|
||||
}
|
||||
return oneCharacterItems;
|
||||
}, []);
|
||||
leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||
|
||||
return {
|
||||
...page,
|
||||
items: newTextItems
|
||||
};
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
pages: newPages,
|
||||
messages: ["Converted " + foundVerticals + " verticals"]
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// Collects runs of one-character lines (text written vertically) and, when a
// run is long enough, unites them into a single horizontal LineItem.
// NOTE(review): shouldStash / doMatchesStash / doFlushStash are presumably the
// hooks invoked by StashingStream (not visible here) — confirm against the base class.
class VerticalsStream extends StashingStream {

    constructor() {
        super();
        // Number of stashed runs that were united into one horizontal line.
        this.foundVerticals = 0;
    }

    // A line is a candidate when it holds exactly one word of exactly one character.
    shouldStash(item) {
        return item.words.length === 1 && item.words[0].string.length === 1;
    }

    // The next candidate continues the run when it sits more than 5 units below
    // the previous one and its single word has the same word type.
    doMatchesStash(lastItem, item) {
        return lastItem.y - item.y > 5 && lastItem.words[0].type === item.words[0].type;
    }

    // Flushes a finished run into `results`.
    // Runs of more than 5 lines are united: every original line is kept but
    // annotated as removed, followed by one added LineItem that carries all words.
    // Shorter runs are passed through unchanged.
    doFlushStash(stash, results) {
        if (stash.length > 5) { // unite
            const combinedWords = [];
            // Start from +/-Infinity rather than fixed sentinels (999 / 0) so the
            // bounding values are right for any coordinate range.
            let minX = Infinity;
            let maxY = -Infinity;
            let sumWidth = 0;
            let maxHeight = 0;
            stash.forEach(oneCharacterLine => {
                oneCharacterLine.annotation = REMOVED_ANNOTATION;
                results.push(oneCharacterLine);
                combinedWords.push(oneCharacterLine.words[0]);
                minX = Math.min(minX, oneCharacterLine.x);
                maxY = Math.max(maxY, oneCharacterLine.y);
                sumWidth += oneCharacterLine.width;
                maxHeight = Math.max(maxHeight, oneCharacterLine.height);
            });
            // The united line inherits the remaining properties of the first stashed line.
            results.push(new LineItem({
                ...stash[0],
                x: minX,
                y: maxY,
                width: sumWidth,
                height: maxHeight,
                words: combinedWords,
                annotation: ADDED_ANNOTATION
            }));
            this.foundVerticals++;
        } else { // add as singles
            results.push(...stash);
        }
    }
}
|
||||
|
@ -1,11 +1,11 @@
|
||||
import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx';
|
||||
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { minXFromBlocks } from '../../../textItemFunctions.jsx';
|
||||
import { minXFromBlocks } from '../../../pageItemFunctions.jsx';
|
||||
|
||||
//Detect items which are code/quote blocks
|
||||
export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation {
|
||||
export default class DetectCodeQuoteBlocks extends ToLineItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Code/Quote Blocks");
|
||||
@ -17,7 +17,7 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
|
||||
parseResult.pages.forEach(page => {
|
||||
var minX = minXFromBlocks(page.items);
|
||||
page.items.forEach(block => {
|
||||
if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) {
|
||||
if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) {
|
||||
block.annotation = DETECTED_ANNOTATION;
|
||||
block.type = ElementType.CODE;
|
||||
foundCodeItems++;
|
||||
@ -36,14 +36,14 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
|
||||
|
||||
}
|
||||
|
||||
function looksLikeCodeBlock(minX, textItems, mostUsedHeight) {
|
||||
if (textItems.length == 0) {
|
||||
function looksLikeCodeBlock(minX, items, mostUsedHeight) {
|
||||
if (items.length == 0) {
|
||||
return false;
|
||||
}
|
||||
if (textItems.length == 1) {
|
||||
return textItems[0].x > minX && textItems[0].height <= mostUsedHeight + 1;
|
||||
if (items.length == 1) {
|
||||
return items[0].x > minX && items[0].height <= mostUsedHeight + 1;
|
||||
}
|
||||
for ( var item of textItems ) {
|
||||
for ( var item of items ) {
|
||||
if (item.x == minX) {
|
||||
return false;
|
||||
}
|
||||
|
@ -1,10 +1,11 @@
|
||||
import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx';
|
||||
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import Word from '../../Word.jsx';
|
||||
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
|
||||
// Cares for proper sub-item spacing/leveling
|
||||
export default class DetectListLevels extends ToTextItemBlockTransformation {
|
||||
export default class DetectListLevels extends ToLineItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Level Lists");
|
||||
@ -21,23 +22,25 @@ export default class DetectListLevels extends ToTextItemBlockTransformation {
|
||||
var currentLevel = 0;
|
||||
const xByLevel = {};
|
||||
var modifiedBlock = false;
|
||||
listBlock.textItems.forEach(textItem => {
|
||||
listBlock.items.forEach(item => {
|
||||
const isListItem = true;
|
||||
if (lastItemX && isListItem) {
|
||||
if (textItem.x > lastItemX) {
|
||||
if (item.x > lastItemX) {
|
||||
currentLevel++;
|
||||
xByLevel[textItem.x] = currentLevel;
|
||||
} else if (textItem.x < lastItemX) {
|
||||
currentLevel = xByLevel[textItem.x];
|
||||
xByLevel[item.x] = currentLevel;
|
||||
} else if (item.x < lastItemX) {
|
||||
currentLevel = xByLevel[item.x];
|
||||
}
|
||||
} else {
|
||||
xByLevel[textItem.x] = 0;
|
||||
xByLevel[item.x] = 0;
|
||||
}
|
||||
if (currentLevel > 0) {
|
||||
textItem.text = ' '.repeat(currentLevel * 3) + textItem.text;
|
||||
item.words = [new Word({
|
||||
string: ' '.repeat(currentLevel * 3)
|
||||
})].concat(item.words);
|
||||
modifiedBlock = true;
|
||||
}
|
||||
lastItemX = textItem.x;
|
||||
lastItemX = item.x;
|
||||
});
|
||||
listBlocks++;
|
||||
if (modifiedBlock) {
|
||||
|
@ -1,11 +1,11 @@
|
||||
import ToTextItemBlockTransformation from '../ToTextItemBlockTransformation.jsx';
|
||||
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItemBlock from '../../TextItemBlock.jsx';
|
||||
import LineItemBlock from '../../LineItemBlock.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import { minXFromTextItems } from '../../../textItemFunctions.jsx';
|
||||
import { minXFromPageItems } from '../../../pageItemFunctions.jsx';
|
||||
|
||||
// Gathers lines to blocks
|
||||
export default class GatherBlocks extends ToTextItemBlockTransformation {
|
||||
export default class GatherBlocks extends ToLineItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Gather Blocks");
|
||||
@ -14,29 +14,29 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var createdBlocks = 0;
|
||||
var textItems = 0;
|
||||
var lineItemCount = 0;
|
||||
parseResult.pages.map(page => {
|
||||
textItems += page.items.length;
|
||||
lineItemCount += page.items.length;
|
||||
const blocks = [];
|
||||
var stashedBlock = new TextItemBlock({});
|
||||
var stashedBlock = new LineItemBlock({});
|
||||
const flushStashedItems = () => {
|
||||
if (stashedBlock.textItems.length > 1) {
|
||||
if (stashedBlock.items.length > 1) {
|
||||
stashedBlock.annotation = DETECTED_ANNOTATION;
|
||||
}
|
||||
|
||||
blocks.push(stashedBlock);
|
||||
stashedBlock = new TextItemBlock({});
|
||||
stashedBlock = new LineItemBlock({});
|
||||
createdBlocks++;
|
||||
};
|
||||
|
||||
var minX = minXFromTextItems(page.items);
|
||||
var minX = minXFromPageItems(page.items);
|
||||
page.items.forEach(item => {
|
||||
if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
|
||||
if (stashedBlock.items.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
|
||||
flushStashedItems();
|
||||
}
|
||||
stashedBlock.addTextItem(item);
|
||||
stashedBlock.addItem(item);
|
||||
});
|
||||
if (stashedBlock.textItems.length > 0) {
|
||||
if (stashedBlock.items.length > 0) {
|
||||
flushStashedItems();
|
||||
}
|
||||
page.items = blocks;
|
||||
@ -44,7 +44,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items']
|
||||
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + lineItemCount + ' line items']
|
||||
});
|
||||
}
|
||||
|
||||
@ -54,7 +54,7 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
|
||||
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
|
||||
return false;
|
||||
}
|
||||
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
|
||||
const lastItem = stashedBlock.items[stashedBlock.items.length - 1];
|
||||
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
|
||||
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
|
||||
return false;
|
||||
|
@ -1,10 +1,10 @@
|
||||
import TextItemBlock from './models/TextItemBlock.jsx';
|
||||
import TextItem from './models/TextItem.jsx';
|
||||
import PageItem from './models/PageItem.jsx';
|
||||
import LineItemBlock from './models/LineItemBlock.jsx';
|
||||
|
||||
export function minXFromBlocks(blocks:TextItemBlock[]) {
|
||||
export function minXFromBlocks(blocks:LineItemBlock[]) {
|
||||
var minX = 999;
|
||||
blocks.forEach(block => {
|
||||
block.textItems.forEach(item => {
|
||||
block.items.forEach(item => {
|
||||
minX = Math.min(minX, item.x)
|
||||
});
|
||||
});
|
||||
@ -14,7 +14,7 @@ export function minXFromBlocks(blocks:TextItemBlock[]) {
|
||||
return minX;
|
||||
}
|
||||
|
||||
export function minXFromTextItems(items:TextItem) {
|
||||
export function minXFromPageItems(items:PageItem) {
|
||||
var minX = 999;
|
||||
items.forEach(item => {
|
||||
minX = Math.min(minX, item.x)
|
||||
@ -25,13 +25,13 @@ export function minXFromTextItems(items:TextItem) {
|
||||
return minX;
|
||||
}
|
||||
|
||||
export function sortByX(items:TextItem) {
|
||||
export function sortByX(items:PageItem) {
|
||||
items.sort((a, b) => {
|
||||
return a.x - b.x;
|
||||
});
|
||||
}
|
||||
|
||||
export function sortCopyByX(items:TextItem) {
|
||||
export function sortCopyByX(items:PageItem) {
|
||||
const copy = items.concat();
|
||||
sortByX(copy);
|
||||
return copy;
|
@ -1,31 +1,30 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
|
||||
import TextItem from '../src/javascript/models/TextItem.jsx';
|
||||
import LineItem from '../src/javascript/models/LineItem.jsx';
|
||||
|
||||
describe('HeadlineFinder', () => {
|
||||
|
||||
|
||||
it('Not Found - Case 1', () => {
|
||||
const headlineFinder = new HeadlineFinder({
|
||||
headline: 'My Little Headline'
|
||||
});
|
||||
const item1 = new TextItem({
|
||||
const item1 = new LineItem({
|
||||
text: 'My '
|
||||
});
|
||||
const item2 = new TextItem({
|
||||
const item2 = new LineItem({
|
||||
text: 'Little'
|
||||
});
|
||||
const item3 = new TextItem({
|
||||
const item3 = new LineItem({
|
||||
text: ' Headline2'
|
||||
});
|
||||
|
||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.consume(item3)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
|
||||
|
||||
});
|
||||
|
||||
@ -33,22 +32,22 @@ describe('HeadlineFinder', () => {
|
||||
const headlineFinder = new HeadlineFinder({
|
||||
headline: 'My Little Headline'
|
||||
});
|
||||
const item1 = new TextItem({
|
||||
const item1 = new LineItem({
|
||||
text: 'My '
|
||||
});
|
||||
const item2 = new TextItem({
|
||||
const item2 = new LineItem({
|
||||
text: 'Little'
|
||||
});
|
||||
const item3 = new TextItem({
|
||||
const item3 = new LineItem({
|
||||
text: ' Headline'
|
||||
});
|
||||
|
||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
|
||||
});
|
||||
|
||||
@ -56,27 +55,27 @@ describe('HeadlineFinder', () => {
|
||||
const headlineFinder = new HeadlineFinder({
|
||||
headline: 'My Little Headline'
|
||||
});
|
||||
const item0 = new TextItem({
|
||||
const item0 = new LineItem({
|
||||
text: 'Waste '
|
||||
});
|
||||
const item1 = new TextItem({
|
||||
const item1 = new LineItem({
|
||||
text: 'My '
|
||||
});
|
||||
const item2 = new TextItem({
|
||||
const item2 = new LineItem({
|
||||
text: 'Little'
|
||||
});
|
||||
const item3 = new TextItem({
|
||||
const item3 = new LineItem({
|
||||
text: ' Headline'
|
||||
});
|
||||
|
||||
expect(headlineFinder.consume(item0)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
|
||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
|
||||
});
|
||||
|
||||
@ -84,27 +83,27 @@ describe('HeadlineFinder', () => {
|
||||
const headlineFinder = new HeadlineFinder({
|
||||
headline: 'My Little Headline'
|
||||
});
|
||||
const item0 = new TextItem({
|
||||
const item0 = new LineItem({
|
||||
text: 'My '
|
||||
});
|
||||
const item1 = new TextItem({
|
||||
const item1 = new LineItem({
|
||||
text: 'My '
|
||||
});
|
||||
const item2 = new TextItem({
|
||||
const item2 = new LineItem({
|
||||
text: 'Little'
|
||||
});
|
||||
const item3 = new TextItem({
|
||||
const item3 = new LineItem({
|
||||
text: ' Headline'
|
||||
});
|
||||
|
||||
expect(headlineFinder.consume(item0)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item0);
|
||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
|
||||
});
|
||||
|
||||
@ -112,22 +111,22 @@ describe('HeadlineFinder', () => {
|
||||
const headlineFinder = new HeadlineFinder({
|
||||
headline: 'MYLitt le HEADline'
|
||||
});
|
||||
const item1 = new TextItem({
|
||||
const item1 = new LineItem({
|
||||
text: 'My '
|
||||
});
|
||||
const item2 = new TextItem({
|
||||
const item2 = new LineItem({
|
||||
text: 'Little'
|
||||
});
|
||||
const item3 = new TextItem({
|
||||
const item3 = new LineItem({
|
||||
text: ' Headline'
|
||||
});
|
||||
|
||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||
|
||||
});
|
||||
|
||||
|
@ -2,9 +2,10 @@ import { expect } from 'chai';
|
||||
|
||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
||||
|
||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
|
||||
it('single word', () => {
|
||||
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false);
|
||||
|
||||
@ -38,7 +39,7 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('removeLeadingWhitespaces', () => {
|
||||
describe('functions: removeLeadingWhitespaces', () => {
|
||||
it('No Removes', () => {
|
||||
expect(removeLeadingWhitespaces(".")).to.be.equal(".");
|
||||
expect(removeLeadingWhitespaces(". ")).to.be.equal(". ");
|
||||
@ -54,7 +55,7 @@ describe('removeLeadingWhitespaces', () => {
|
||||
|
||||
});
|
||||
|
||||
describe('removeTrailingWhitespaces', () => {
|
||||
describe('functions: removeTrailingWhitespaces', () => {
|
||||
it('No Removes', () => {
|
||||
expect(removeTrailingWhitespaces(".")).to.be.equal(".");
|
||||
expect(removeTrailingWhitespaces(" .")).to.be.equal(" .");
|
||||
@ -71,7 +72,7 @@ describe('removeTrailingWhitespaces', () => {
|
||||
});
|
||||
|
||||
|
||||
describe('prefixAfterWhitespace', () => {
|
||||
describe('functions: prefixAfterWhitespace', () => {
|
||||
it('Basic', () => {
|
||||
expect(prefixAfterWhitespace('1', '2')).to.be.equal('12');
|
||||
expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12');
|
||||
@ -81,7 +82,7 @@ describe('prefixAfterWhitespace', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('suffixBeforeWhitespace', () => {
|
||||
describe('functions: suffixBeforeWhitespace', () => {
|
||||
it('Basic', () => {
|
||||
expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
|
||||
expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.');
|
||||
@ -92,7 +93,7 @@ describe('suffixBeforeWhitespace', () => {
|
||||
});
|
||||
|
||||
|
||||
describe('charCodeArray', () => {
|
||||
describe('functions: charCodeArray', () => {
|
||||
it('Charcodes', () => {
|
||||
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
|
||||
});
|
||||
@ -105,7 +106,7 @@ describe('charCodeArray', () => {
|
||||
|
||||
});
|
||||
|
||||
describe('normalizedCharCodeArray', () => {
|
||||
describe('functions: normalizedCharCodeArray', () => {
|
||||
|
||||
it('No Change', () => {
|
||||
expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
|
||||
@ -131,7 +132,7 @@ describe('normalizedCharCodeArray', () => {
|
||||
|
||||
});
|
||||
|
||||
describe('isListItem', () => {
|
||||
describe('functions: isListItem', () => {
|
||||
|
||||
it('Match', () => {
|
||||
expect(isListItem('- my text')).to.equal(true);
|
||||
@ -154,7 +155,7 @@ describe('isListItem', () => {
|
||||
|
||||
});
|
||||
|
||||
describe('isNumberedListItem', () => {
|
||||
describe('functions: isNumberedListItem', () => {
|
||||
|
||||
it('Match', () => {
|
||||
expect(isNumberedListItem('1. my text')).to.equal(true);
|
||||
@ -173,7 +174,7 @@ describe('isNumberedListItem', () => {
|
||||
|
||||
});
|
||||
|
||||
describe('wordsMatch', () => {
|
||||
describe('functions: wordsMatch', () => {
|
||||
|
||||
it('Match', () => {
|
||||
expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
|
||||
|
64
test/models/StashingStream.spec.js
Normal file
64
test/models/StashingStream.spec.js
Normal file
@ -0,0 +1,64 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import StashingStream from '../../src/javascript/models/StashingStream';
|
||||
import TextItem from '../../src/javascript/models/TextItem.jsx';
|
||||
|
||||
// Specs for StashingStream, driven through the MyStashingStream subclass
// declared further down in this file.
describe('StashingStream', () => {

    // Feeds the items one at a time via consume().
    it('Simple', () => {
        const stream = new MyStashingStream();
        const input = ['a', 'b', 'a', 'a', 'z', 'm', 'm', 'z', 'z', 'c', 'e', 'f', 'm', 'a'];

        input.forEach(item => stream.consume(item));

        const resultsAsString = stream.complete().join('');

        expect(resultsAsString).to.equal('AbAAZZZcefA');
        expect(stream.transformedItems).to.equal(10);
    });

    // Feeds the whole list in one call via consumeAll().
    it('ConsumeAll', () => {
        const stream = new MyStashingStream();
        const items = ['k', 'k', 'x', 'a', 'm', 'z', 'o', 'p'];

        stream.consumeAll(items);

        const resultsAsString = stream.complete().join('');
        expect(resultsAsString).to.equal('kkxAZop');
        expect(stream.transformedItems).to.equal(3);
    });

});
|
||||
|
||||
|
||||
// Test double for StashingStream: stashes the letters 'a', 'z' and 'm',
// groups identical neighbours, upper-cases them on flush and drops every 'm'.
class MyStashingStream extends StashingStream {

    constructor() {
        super();
        // Total number of items that went through doFlushStash (dropped ones included).
        this.transformedItems = 0;
    }

    // Only these three letters are stash candidates.
    shouldStash(item) {
        return ['a', 'z', 'm'].includes(item);
    }

    // An item belongs to the current stash when it equals the previous stashed item.
    doMatchesStash(lastItem, item) {
        return item === lastItem;
    }

    // Counts the flushed items, then emits the upper-cased survivors (everything but 'm').
    doFlushStash(stash, results) {
        this.transformedItems += stash.length;
        const survivors = stash.filter(elem => elem !== 'm');
        const upperCased = survivors.map(item => item.toUpperCase());
        results.push(...upperCased);
    }
}
|
Loading…
Reference in New Issue
Block a user