mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
WIP Introduce word/wordType/lineItem
* Provide a way to do the markdown transformation of inline formats (bold, italic, link, footnote, etc.) at the end and not in the middle * Introduce StashingStream as a helper
This commit is contained in:
parent
fde670e83f
commit
09facb09b4
@ -7,7 +7,7 @@
|
|||||||
"watch": "webpack -d --watch",
|
"watch": "webpack -d --watch",
|
||||||
"build": "webpack",
|
"build": "webpack",
|
||||||
"lint": "eslint src --ext .js --ext .jsx --cache",
|
"lint": "eslint src --ext .js --ext .jsx --cache",
|
||||||
"test": "mocha --compilers js:babel-core/register test/*.spec.js",
|
"test": "mocha --compilers js:babel-core/register test --recursive",
|
||||||
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
|
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
|
||||||
"deploy": "npm run release && cp -r build/* docs/"
|
"deploy": "npm run release && cp -r build/* docs/"
|
||||||
},
|
},
|
||||||
|
@ -1,13 +1,12 @@
|
|||||||
import React from 'react';
|
import React from 'react';
|
||||||
import PageView from './PageView.jsx';
|
import PageView from './PageView.jsx';
|
||||||
import TextItemTable from './TextItemTable.jsx';
|
import LineItemTable from './LineItemTable.jsx';
|
||||||
|
|
||||||
// View for a Page which items are of kind TextItemBlock
|
// View for a Page which items are of kind LineItemBlock
|
||||||
export default class TextItemBlockPageView extends PageView {
|
export default class LineItemBlockPageView extends PageView {
|
||||||
|
|
||||||
createItemViews(items, showWhitespaces) {
|
createItemViews(items, showWhitespaces) {
|
||||||
const blockTables = items.map((block, i) => {
|
const blockTables = items.map((block, i) => {
|
||||||
var textItems = block.textItems;
|
|
||||||
const blockType = block.type ? ' - ' + block.type.name : null;
|
const blockType = block.type ? ' - ' + block.type.name : null;
|
||||||
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
|
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
|
||||||
: null;
|
: null;
|
||||||
@ -38,7 +37,7 @@ export default class TextItemBlockPageView extends PageView {
|
|||||||
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
|
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
|
||||||
</div>
|
</div>
|
||||||
<div style={ borderStyle }>
|
<div style={ borderStyle }>
|
||||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
<LineItemTable items={ block.items } showWhitespaces={ showWhitespaces } />
|
||||||
{ footnoteLinks }
|
{ footnoteLinks }
|
||||||
{ footnotes }
|
{ footnotes }
|
||||||
</div>
|
</div>
|
12
src/javascript/components/debug/LineItemPageView.jsx
Normal file
12
src/javascript/components/debug/LineItemPageView.jsx
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import React from 'react';
|
||||||
|
import PageView from './PageView.jsx';
|
||||||
|
import LineItemTable from './LineItemTable.jsx';
|
||||||
|
|
||||||
|
// View for a Page which items are of kind LineItem
|
||||||
|
export default class LineItemPageView extends PageView {
|
||||||
|
|
||||||
|
createItemViews(items, showWhitespaces) {
|
||||||
|
return <LineItemTable items={ items } showWhitespaces={ showWhitespaces } />
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
108
src/javascript/components/debug/LineItemTable.jsx
Normal file
108
src/javascript/components/debug/LineItemTable.jsx
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
import React from 'react';
|
||||||
|
|
||||||
|
import Table from 'react-bootstrap/lib/Table'
|
||||||
|
|
||||||
|
// Displays an array of LineItem as a table
|
||||||
|
export default class LineItemTable extends React.Component {
|
||||||
|
|
||||||
|
static propTypes = {
|
||||||
|
items: React.PropTypes.array.isRequired,
|
||||||
|
showWhitespaces: React.PropTypes.bool
|
||||||
|
};
|
||||||
|
|
||||||
|
render() {
|
||||||
|
const {showWhitespaces, items} = this.props;
|
||||||
|
const tableHeader = <thead>
|
||||||
|
<tr>
|
||||||
|
<th>
|
||||||
|
#
|
||||||
|
</th>
|
||||||
|
<th>
|
||||||
|
Text
|
||||||
|
</th>
|
||||||
|
<th>
|
||||||
|
X
|
||||||
|
</th>
|
||||||
|
<th>
|
||||||
|
Y
|
||||||
|
</th>
|
||||||
|
<th>
|
||||||
|
Width
|
||||||
|
</th>
|
||||||
|
<th>
|
||||||
|
Height
|
||||||
|
</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
|
||||||
|
const itemRows = items.map((item, i) => <tr key={ i } style={ item.annotation ? {
|
||||||
|
color: item.annotation.color
|
||||||
|
} : null }>
|
||||||
|
<td>
|
||||||
|
<div style={ { textAlign: 'center' } }>
|
||||||
|
{ i }
|
||||||
|
</div>
|
||||||
|
<div style={ { textAlign: 'center' } }>
|
||||||
|
{ item.annotation ? item.annotation.category : '' }
|
||||||
|
</div>
|
||||||
|
<div style={ { textAlign: 'center', color: 'brown' } }>
|
||||||
|
{ item.type ? item.type.name : '' }
|
||||||
|
</div>
|
||||||
|
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||||
|
{ item.parsedElements && item.parsedElements.footnoteLinks.length > 0 ? <div>
|
||||||
|
Footnote-Link
|
||||||
|
</div> : '' }
|
||||||
|
{ item.parsedElements && item.parsedElements.containLinks ? <div>
|
||||||
|
Link
|
||||||
|
</div> : '' }
|
||||||
|
{ item.lineFormat ? <div>
|
||||||
|
{ item.lineFormat.name }
|
||||||
|
</div> : '' }
|
||||||
|
{ item.unopenedFormat ? <div>
|
||||||
|
Unopened
|
||||||
|
{ ' ' + item.unopenedFormat.name }
|
||||||
|
</div> : '' }
|
||||||
|
{ item.parsedElements && item.parsedElements.inlineFormats > 0 ? <div>
|
||||||
|
{ item.parsedElements.inlineFormats + 'x Bold/Italic' }
|
||||||
|
</div> : '' }
|
||||||
|
{ item.unclosedFormat ? <div>
|
||||||
|
Unclosed
|
||||||
|
{ ' ' + item.unclosedFormat.name }
|
||||||
|
</div> : '' }
|
||||||
|
</div>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
{ showWhitespaces ? (
|
||||||
|
<pre style={ item.annotation ? {
|
||||||
|
color: item.annotation.color,
|
||||||
|
display: 'inline-block',
|
||||||
|
} : {
|
||||||
|
display: 'inline-block'
|
||||||
|
} }>{ item.text() }</pre>
|
||||||
|
) : (item.text()) }
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
{ item.x }
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
{ item.y }
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
{ item.width }
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
{ item.height }
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
)
|
||||||
|
|
||||||
|
return (
|
||||||
|
<Table responsive condensed bordered>
|
||||||
|
{ tableHeader }
|
||||||
|
<tbody>
|
||||||
|
{ itemRows }
|
||||||
|
</tbody>
|
||||||
|
</Table>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
@ -18,6 +18,17 @@ export function isNumber(string) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true if every character of the given string equals the first
// character of 'char' (an empty string therefore yields true).
export function hasOnly(string, char) {
    const expectedCode = char.charCodeAt(0);
    let index = string.length;
    // walk backwards and bail out on the first deviating character
    while (index-- > 0) {
        if (string.charCodeAt(index) !== expectedCode) {
            return false;
        }
    }
    return true;
}
|
||||||
|
|
||||||
export function hasUpperCaseCharacterInMiddleOfWord(text) {
|
export function hasUpperCaseCharacterInMiddleOfWord(text) {
|
||||||
var beginningOfWord = true;
|
var beginningOfWord = true;
|
||||||
for (var i = 0; i < text.length; i++) {
|
for (var i = 0; i < text.length; i++) {
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import { Enum } from 'enumify';
|
import { Enum } from 'enumify';
|
||||||
|
|
||||||
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
|
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
|
||||||
|
|
||||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
||||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||||
@ -56,10 +57,10 @@ export default class AppState {
|
|||||||
new CompactLines(),
|
new CompactLines(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new VerticalToHorizontal(),
|
new VerticalToHorizontal(),
|
||||||
new PostprocessLines(),
|
// new PostprocessLines(),
|
||||||
new DetectTOC(),
|
new DetectTOC(),
|
||||||
new DetectHeaders(),
|
new DetectHeaders(),
|
||||||
new CompleteFormats(),
|
// new CompleteFormats(),
|
||||||
new DetectListItems(),
|
new DetectListItems(),
|
||||||
|
|
||||||
new GatherBlocks(),
|
new GatherBlocks(),
|
||||||
|
@ -1,83 +1,85 @@
|
|||||||
import { Enum } from 'enumify';
|
import { Enum } from 'enumify';
|
||||||
import TextItem from './TextItem.jsx';
|
import LineItem from './LineItem.jsx';
|
||||||
import TextItemBlock from './TextItemBlock.jsx';
|
import LineItemBlock from './LineItemBlock.jsx';
|
||||||
|
|
||||||
// An Markdown element
|
// A Markdown element
|
||||||
export default class ElementType extends Enum {
|
export default class ElementType extends Enum {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//TODO rename to BlockType
|
||||||
|
|
||||||
ElementType.initEnum({
|
ElementType.initEnum({
|
||||||
H1: {
|
H1: {
|
||||||
headline: true,
|
headline: true,
|
||||||
headlineLevel: 1,
|
headlineLevel: 1,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return '# ' + concatTextItems(block.textItems);
|
return '# ' + concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
H2: {
|
H2: {
|
||||||
headline: true,
|
headline: true,
|
||||||
headlineLevel: 2,
|
headlineLevel: 2,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return '## ' + concatTextItems(block.textItems);
|
return '## ' + concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
H3: {
|
H3: {
|
||||||
headline: true,
|
headline: true,
|
||||||
headlineLevel: 3,
|
headlineLevel: 3,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return '### ' + concatTextItems(block.textItems);
|
return '### ' + concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
H4: {
|
H4: {
|
||||||
headline: true,
|
headline: true,
|
||||||
headlineLevel: 4,
|
headlineLevel: 4,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return '#### ' + concatTextItems(block.textItems);
|
return '#### ' + concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
H5: {
|
H5: {
|
||||||
headline: true,
|
headline: true,
|
||||||
headlineLevel: 5,
|
headlineLevel: 5,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return '##### ' + concatTextItems(block.textItems);
|
return '##### ' + concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
H6: {
|
H6: {
|
||||||
headline: true,
|
headline: true,
|
||||||
headlineLevel: 6,
|
headlineLevel: 6,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return '###### ' + concatTextItems(block.textItems);
|
return '###### ' + concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
TOC: {
|
TOC: {
|
||||||
mergeToBlock: true,
|
mergeToBlock: true,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return concatTextItems(block.textItems);
|
return concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
FOOTNOTES: {
|
FOOTNOTES: {
|
||||||
mergeToBlock: true,
|
mergeToBlock: true,
|
||||||
mergeFollowingNonTypedItems: true,
|
mergeFollowingNonTypedItems: true,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return concatTextItems(block.textItems);
|
return concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
CODE: {
|
CODE: {
|
||||||
mergeToBlock: true,
|
mergeToBlock: true,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
return '```\n' + concatLineItems(block.items) + '```'
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
LIST: {
|
LIST: {
|
||||||
mergeToBlock: true,
|
mergeToBlock: true,
|
||||||
mergeFollowingNonTypedItemsWithSmallDistance: true,
|
mergeFollowingNonTypedItemsWithSmallDistance: true,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return concatTextItems(block.textItems);
|
return concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
PARAGRAPH: {
|
PARAGRAPH: {
|
||||||
toText(block:TextItemBlock) {
|
toText(block:LineItemBlock) {
|
||||||
return concatTextItems(block.textItems);
|
return concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -86,17 +88,17 @@ export function isHeadline(elementType: ElementType) {
|
|||||||
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
|
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
|
||||||
}
|
}
|
||||||
|
|
||||||
export function blockToText(block: TextItemBlock) {
|
export function blockToText(block: LineItemBlock) {
|
||||||
if (!block.type) {
|
if (!block.type) {
|
||||||
return concatTextItems(block.textItems);
|
return concatLineItems(block.items);
|
||||||
}
|
}
|
||||||
return block.type.toText(block);
|
return block.type.toText(block);
|
||||||
}
|
}
|
||||||
|
|
||||||
function concatTextItems(textItems: TextItem[]) {
|
function concatLineItems(lineItems: LineItem[]) {
|
||||||
var text = '';
|
var text = '';
|
||||||
textItems.forEach(item => {
|
lineItems.forEach(item => {
|
||||||
text += item.text + '\n';
|
text += item.text() + '\n';
|
||||||
});
|
});
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
@ -4,24 +4,25 @@ export default class HeadlineFinder {
|
|||||||
|
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
this.headlineCharCodes = normalizedCharCodeArray(options.headline);
|
this.headlineCharCodes = normalizedCharCodeArray(options.headline);
|
||||||
this.stackedTextItems = [];
|
this.stackedLineItems = [];
|
||||||
this.stackedChars = 0;
|
this.stackedChars = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
consume(textItem) {
|
consume(lineItem) {
|
||||||
const normalizedCharCodes = normalizedCharCodeArray(textItem.text);
|
//TODO avoid join
|
||||||
|
const normalizedCharCodes = normalizedCharCodeArray(lineItem.text());
|
||||||
const matchAll = this.matchAll(normalizedCharCodes);
|
const matchAll = this.matchAll(normalizedCharCodes);
|
||||||
if (matchAll) {
|
if (matchAll) {
|
||||||
this.stackedTextItems.push(textItem);
|
this.stackedLineItems.push(lineItem);
|
||||||
this.stackedChars += normalizedCharCodes.length;
|
this.stackedChars += normalizedCharCodes.length;
|
||||||
if (this.stackedChars == this.headlineCharCodes.length) {
|
if (this.stackedChars == this.headlineCharCodes.length) {
|
||||||
return this.stackedTextItems;
|
return this.stackedLineItems;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (this.stackedChars > 0) {
|
if (this.stackedChars > 0) {
|
||||||
this.stackedChars = 0;
|
this.stackedChars = 0;
|
||||||
this.stackedTextItems = [];
|
this.stackedLineItems = [];
|
||||||
this.consume(textItem); // test again without stack
|
this.consume(lineItem); // test again without stack
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
|
145
src/javascript/models/LineConverter.jsx
Normal file
145
src/javascript/models/LineConverter.jsx
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
import TextItem from './TextItem.jsx';
|
||||||
|
import Word from './Word.jsx';
|
||||||
|
import WordType from './markdown/WordType.jsx';
|
||||||
|
import LineItem from './LineItem.jsx';
|
||||||
|
import StashingStream from './StashingStream.jsx';
|
||||||
|
import { ParsedElements } from './PageItem.jsx';
|
||||||
|
import { isNumber } from '../functions.jsx'
|
||||||
|
import { sortByX } from '../pageItemFunctions.jsx'
|
||||||
|
|
||||||
|
// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like
|
||||||
|
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||||
|
export default class LineConverter {
|
||||||
|
|
||||||
|
constructor(fontToFormats) {
|
||||||
|
this.fontToFormats = fontToFormats;
|
||||||
|
}
|
||||||
|
|
||||||
|
// returns a CombineResult
|
||||||
|
compact(textItems: TextItem[]) {
|
||||||
|
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||||
|
sortByX(textItems);
|
||||||
|
|
||||||
|
const wordStream = new WordDetectionStream(this.fontToFormats);
|
||||||
|
wordStream.consumeAll(textItems.map(item => new TextItem({
|
||||||
|
...item
|
||||||
|
})));
|
||||||
|
const words = wordStream.complete();
|
||||||
|
|
||||||
|
var maxHeight = 0;
|
||||||
|
var widthSum = 0;
|
||||||
|
textItems.forEach(item => {
|
||||||
|
maxHeight = Math.max(maxHeight, item.height);
|
||||||
|
widthSum += item.width;
|
||||||
|
});
|
||||||
|
return new LineItem({
|
||||||
|
x: textItems[0].x,
|
||||||
|
y: textItems[0].y,
|
||||||
|
height: maxHeight,
|
||||||
|
width: widthSum,
|
||||||
|
words: words,
|
||||||
|
parsedElements: new ParsedElements({
|
||||||
|
footnoteLinks: wordStream.footnoteLinks,
|
||||||
|
footnotes: wordStream.footnotes
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Combines the text of the given items, splits it at single spaces and
// wraps every non-blank piece into a Word carrying the given format as type.
function itemsToWords(items, format) {
    return combineText(items)
        .split(' ')
        .filter(candidate => candidate.trim().length > 0)
        .map(string => new Word({
            string: string,
            type: format
        }));
}
|
||||||
|
|
||||||
|
// Concatenates the text of the given items in order. A single space is inserted
// between two items if neither side already supplies one and the horizontal gap
// between them (x of the item minus the right edge of its predecessor) exceeds 5.
function combineText(textItems) {
    let combined = '';
    let previous;
    textItems.forEach(current => {
        // first item, or a whitespace is already present at the seam
        const alreadySeparated = !previous || combined.endsWith(' ') || current.text.startsWith(' ');
        if (!alreadySeparated) {
            const gap = current.x - previous.x - previous.width;
            if (gap > 5) {
                combined += ' ';
            }
        }
        combined += current.text;
        previous = current;
    });
    return combined;
}
|
||||||
|
|
||||||
|
// Stream which turns the text items of a line into Words. Consecutive items are
// stashed as long as they share the same format and the same 'is a number' state.
// A flushed run of numbers becomes a FOOTNOTE_LINK or FOOTNOTE word depending on
// its y position, everything else is split into plain words of the run's format.
class WordDetectionStream extends StashingStream {

    // fontToFormats: lookup (used via get(font)) from a font to its format
    constructor(fontToFormats) {
        super();
        this.fontToFormats = fontToFormats;
        this.footnoteLinks = []; // numbers (parsed ints) of the detected footnote links
        this.footnotes = []; // numbers (as strings) of the detected footnotes

        this.firstY = undefined; // y of the first consumed item, the baseline of the line
        this.stashedNumber = false; // whether the most recently stashed item is a number
        this.currentItem = undefined; // the most recently consumed item
    }

    // Every item gets stashed. Remembers the y of the first item and the current item.
    shouldStash(item) {
        // Compare against undefined explicitly: a baseline of y === 0 is valid and
        // must not be overwritten by the y of the following items.
        if (this.firstY === undefined) {
            this.firstY = item.y;
        }
        this.currentItem = item;
        return true;
    }

    onPushOnStash(item) {
        this.stashedNumber = isNumber(item.text.trim());
    }

    // An item continues the stash if it has the same format as the last stashed item
    // and both are numbers or both are not.
    doMatchesStash(lastItem, item) {
        const lastItemFormat = this.fontToFormats.get(lastItem.font);
        const itemFormat = this.fontToFormats.get(item.font);
        if (lastItemFormat !== itemFormat) {
            return false;
        }
        const itemIsANumber = isNumber(item.text.trim());
        return this.stashedNumber === itemIsANumber;
    }

    // Converts the stashed items to words and appends them to the results
    doFlushStash(stash, results) {
        if (!this.stashedNumber) {
            this.copyStashItemsAsText(stash, results);
            return;
        }

        const joinedNumber = stash.map(item => item.text).join('');
        if (stash[0].y > this.firstY) { // footnote link
            results.push(new Word({
                string: joinedNumber,
                type: WordType.FOOTNOTE_LINK
            //TODO format to
            //^
            //`<sup>[${joinedNumber}](#${joinedNumber})</sup>`
            }));
            // explicit radix, so the digits are always read as a decimal number
            this.footnoteLinks.push(parseInt(joinedNumber, 10));
        } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote
            results.push(new Word({
                string: joinedNumber,
                type: WordType.FOOTNOTE
            //TODO format to (^${ joinedNumber}):
            }));
            this.footnotes.push(joinedNumber);
        } else {
            // an ordinary number within the text
            this.copyStashItemsAsText(stash, results);
        }
    }

    // Appends the stash as plain words, typed with the format of the first stashed item
    copyStashItemsAsText(stash, results) {
        const format = this.fontToFormats.get(stash[0].font);
        results.push(...itemsToWords(stash, format));
    }

}
|
29
src/javascript/models/LineItem.jsx
Normal file
29
src/javascript/models/LineItem.jsx
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
import PageItem from './PageItem.jsx'
|
||||||
|
import Word from './Word.jsx'
|
||||||
|
|
||||||
|
//A line within a page, made up of words.
//The words are either passed directly (options.words) or derived from options.text.
export default class LineItem extends PageItem {

    constructor(options) {
        super(options);
        const {x, y, width, height, words, text} = options;
        this.x = x;
        this.y = y;
        this.width = width;
        this.height = height;
        if (words) {
            this.words = words;
        } else if (text) {
            // no words given: split the text at spaces and drop the blank pieces
            this.words = text.split(" ")
                .filter(piece => piece.trim().length > 0)
                .map(piece => new Word({
                    string: piece
                }));
        } else {
            this.words = [];
        }
    }

    // The line as a string: all word strings joined by a single space
    text() {
        const strings = this.wordStrings();
        return strings.join(" ");
    }

    // The strings of all words, in order
    wordStrings() {
        return this.words.map(({string}) => string);
    }

}
|
36
src/javascript/models/LineItemBlock.jsx
Normal file
36
src/javascript/models/LineItemBlock.jsx
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import PageItem from './PageItem.jsx'
|
||||||
|
import LineItem from './LineItem.jsx'
|
||||||
|
|
||||||
|
// A block of LineItem[] within a Page
|
||||||
|
export default class LineItemBlock extends PageItem {
|
||||||
|
|
||||||
|
constructor(options) {
|
||||||
|
super(options);
|
||||||
|
this.items = [];
|
||||||
|
if (options.items) {
|
||||||
|
options.items.forEach(item => this.addItem(item));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
addItem(item:LineItem) {
|
||||||
|
if (this.type && item.type && this.type !== item.type) {
|
||||||
|
throw `Adding item of type ${item.type} to block of type ${this.type}`
|
||||||
|
}
|
||||||
|
if (!this.type) {
|
||||||
|
this.type = item.type;
|
||||||
|
}
|
||||||
|
if (item.parsedElements) {
|
||||||
|
if (this.parsedElements) {
|
||||||
|
this.parsedElements.add(item.parsedElements);
|
||||||
|
} else {
|
||||||
|
this.parsedElements = item.parsedElements;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const copiedItem = new LineItem({
|
||||||
|
...item
|
||||||
|
});
|
||||||
|
copiedItem.type = null;
|
||||||
|
this.items.push(copiedItem);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,4 +1,4 @@
|
|||||||
// A abstract PageItem class, can be TextItem, or TextItemBlock
|
// An abstract PageItem class, can be TextItem, LineItem or LineItemBlock
|
||||||
export default class PageItem {
|
export default class PageItem {
|
||||||
|
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
|
73
src/javascript/models/StashingStream.jsx
Normal file
73
src/javascript/models/StashingStream.jsx
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
//Abstract stream which allows to stash items temporarily.
//Consumed items either go straight to the results or are collected on a stash.
//The stash is flushed (handed to doFlushStash) as soon as an item arrives which
//does not match it or is not stashable, and once more on complete().
//Sub-classes must implement shouldStash, doMatchesStash and doFlushStash.
export default class StashingStream {

    constructor() {
        if (this.constructor === StashingStream) {
            throw new TypeError("Can not construct abstract class.");
        }
        this.results = [];
        this.stash = [];
    }

    // Consumes all given items in order
    consumeAll(items) {
        items.forEach(item => this.consume(item));
    }

    // Consumes a single item: stashes it or appends it to the results
    consume(item) {
        if (this.shouldStash(item)) {
            // an item which does not fit to the current stash starts a new one
            if (!this.matchesStash(item)) {
                this.flushStash();
            }
            this.pushOnStash(item);
        } else {
            // keep the order: pending stash items come before the non-stashable item
            this.flushStash();
            this.results.push(item);
        }
    }

    // Notifies the sub-class hook, then puts the item on the stash
    pushOnStash(item) {
        this.onPushOnStash(item);
        this.stash.push(item);
    }

    // Flushes what is left on the stash and returns the results
    complete() {
        this.flushStash();
        return this.results;
    }

    // return true if the item matches the items of the stash (an empty stash matches everything)
    matchesStash(item) {
        if (this.stash.length === 0) {
            return true;
        }
        const lastItem = this.stash[this.stash.length - 1];
        return this.doMatchesStash(lastItem, item);
    }

    // Hands a non-empty stash over to doFlushStash and starts a fresh stash
    flushStash() {
        if (this.stash.length > 0) {
            this.doFlushStash(this.stash, this.results);
            this.stash = [];
        }
    }

    onPushOnStash(item) { // eslint-disable-line no-unused-vars
        //sub-classes may override
    }

    // Abstract: return true if the item should be stashed, false if it goes directly to the results
    shouldStash(item) { // eslint-disable-line no-unused-vars
        throw new TypeError("Abstract method shouldStash(item) must be implemented by the sub-class.");
    }

    // Abstract: return true if the item belongs to the stash whose last item is lastItem
    doMatchesStash(lastItem, item) { // eslint-disable-line no-unused-vars
        throw new TypeError("Abstract method doMatchesStash(lastItem, item) must be implemented by the sub-class.");
    }

    // Abstract: convert the stashed items and add the outcome to the results
    doFlushStash(stash, results) { // eslint-disable-line no-unused-vars
        throw new TypeError("Abstract method doFlushStash(stash, results) must be implemented by the sub-class.");
    }

}
|
@ -11,8 +11,6 @@ export default class TextItem extends PageItem {
|
|||||||
this.height = options.height;
|
this.height = options.height;
|
||||||
this.text = options.text;
|
this.text = options.text;
|
||||||
this.font = options.font;
|
this.font = options.font;
|
||||||
this.fontAscent = options.fontAscent;
|
|
||||||
this.fontDescent = options.fontDescent;
|
|
||||||
|
|
||||||
this.lineFormat = options.lineFormat;
|
this.lineFormat = options.lineFormat;
|
||||||
this.unopenedFormat = options.unopenedFormat;
|
this.unopenedFormat = options.unopenedFormat;
|
||||||
|
@ -1,36 +0,0 @@
|
|||||||
import PageItem from './PageItem.jsx'
|
|
||||||
import TextItem from './TextItem.jsx'
|
|
||||||
|
|
||||||
// A block of TextItem[] within a Page
|
|
||||||
export default class TextItemBlock extends PageItem {
|
|
||||||
|
|
||||||
constructor(options) {
|
|
||||||
super(options);
|
|
||||||
this.textItems = [];
|
|
||||||
if (options.textItems) {
|
|
||||||
options.textItems.forEach(item => this.addTextItem(item));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
addTextItem(textItem:TextItem) {
|
|
||||||
if (this.type && textItem.type && this.type !== textItem.type) {
|
|
||||||
throw `Adding text item of type ${textItem.type} to block of type ${this.type}`
|
|
||||||
}
|
|
||||||
if (!this.type) {
|
|
||||||
this.type = textItem.type;
|
|
||||||
}
|
|
||||||
if (textItem.parsedElements) {
|
|
||||||
if (this.parsedElements) {
|
|
||||||
this.parsedElements.add(textItem.parsedElements);
|
|
||||||
} else {
|
|
||||||
this.parsedElements = textItem.parsedElements;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const copiedTextItem = new TextItem({
|
|
||||||
...textItem
|
|
||||||
});
|
|
||||||
copiedTextItem.type = null;
|
|
||||||
this.textItems.push(copiedTextItem);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,227 +0,0 @@
|
|||||||
import TextItem from './TextItem.jsx';
|
|
||||||
import { ParsedElements } from './PageItem.jsx';
|
|
||||||
import { isNumber } from '../functions.jsx'
|
|
||||||
import { sortByX } from '../textItemFunctions.jsx'
|
|
||||||
import { prefixAfterWhitespace, suffixBeforeWhitespace } from '../functions.jsx';
|
|
||||||
|
|
||||||
// Compact text items which have been grouped to a line (through TextItemLineCompactor) to a single TextItem doing inline transformations like
|
|
||||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
|
||||||
export default class TextItemLineCompactor {
|
|
||||||
|
|
||||||
constructor(fontToFormats) {
|
|
||||||
this.fontToFormats = fontToFormats;
|
|
||||||
}
|
|
||||||
|
|
||||||
// returns a CombineResult
|
|
||||||
compact(lineItems: TextItem[]) {
|
|
||||||
if (lineItems.length < 2) {
|
|
||||||
throw "Must be at least 2 line items, but was " + lineItems;
|
|
||||||
}
|
|
||||||
|
|
||||||
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
|
||||||
sortByX(lineItems);
|
|
||||||
|
|
||||||
const formatter = new Formatter(this.fontToFormats);
|
|
||||||
var [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
|
|
||||||
resolvedLineItems.forEach(item => formatter.consume(item));
|
|
||||||
resolvedLineItems = formatter.getResults();
|
|
||||||
parsedElements.inlineFormats = formatter.inlineFormats;
|
|
||||||
// const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
|
|
||||||
|
|
||||||
var combinedItem;
|
|
||||||
if (resolvedLineItems.length == 1) {
|
|
||||||
combinedItem = resolvedLineItems[0];
|
|
||||||
} else {
|
|
||||||
var text = '';
|
|
||||||
var maxHeight = 0;
|
|
||||||
var widthSum = 0;
|
|
||||||
var lastItem;
|
|
||||||
resolvedLineItems.forEach(item => {
|
|
||||||
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
|
||||||
const xDistance = item.x - lastItem.x - lastItem.width;
|
|
||||||
if (xDistance >= 5) {
|
|
||||||
text += ' ';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
text += item.text;
|
|
||||||
widthSum += item.width;
|
|
||||||
lastItem = item;
|
|
||||||
maxHeight = Math.max(maxHeight, item.height);
|
|
||||||
});
|
|
||||||
combinedItem = new TextItem({
|
|
||||||
...resolvedLineItems[0],
|
|
||||||
text: text,
|
|
||||||
height: maxHeight,
|
|
||||||
width: widthSum
|
|
||||||
});
|
|
||||||
}
|
|
||||||
combinedItem.parsedElements = parsedElements;
|
|
||||||
combinedItem.lineFormat = formatter.lineFormat;
|
|
||||||
combinedItem.unopenedFormat = formatter.unopenedFormat;
|
|
||||||
combinedItem.unclosedFormat = formatter.unclosedFormat;
|
|
||||||
return combinedItem;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
resolveSpecialElements(lineItems) {
|
|
||||||
const footnoteLinks = [];
|
|
||||||
const footnotes = [];
|
|
||||||
const basicY = lineItems[0].y;
|
|
||||||
const newLineItems = [];
|
|
||||||
var stashedNumberItems = [];
|
|
||||||
|
|
||||||
const commitStashedNumbers = (nextItem) => {
|
|
||||||
if (stashedNumberItems.length > 0) {
|
|
||||||
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
|
||||||
if (stashedNumberItems[0].y > basicY) { // footnote link
|
|
||||||
newLineItems.push(new TextItem({
|
|
||||||
...stashedNumberItems[0],
|
|
||||||
//TODO make fomatting configurable
|
|
||||||
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
|
||||||
text: `^${joinedNumber}`
|
|
||||||
}));
|
|
||||||
footnoteLinks.push(parseInt(joinedNumber));
|
|
||||||
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
|
||||||
//TODO womb comp [29] => ydiff == 0
|
|
||||||
newLineItems.push(new TextItem({
|
|
||||||
...stashedNumberItems[0],
|
|
||||||
text: `(^${ joinedNumber}): `
|
|
||||||
}));
|
|
||||||
footnotes.push(joinedNumber);
|
|
||||||
} else {
|
|
||||||
stashedNumberItems.forEach(number => newLineItems.push(number));
|
|
||||||
}
|
|
||||||
|
|
||||||
stashedNumberItems = [];
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
lineItems.forEach(item => {
|
|
||||||
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
|
||||||
// skip whitespace on the beginning of a line
|
|
||||||
} else {
|
|
||||||
const isANumber = isNumber(item.text.trim());
|
|
||||||
if (isANumber) {
|
|
||||||
stashedNumberItems.push(item);
|
|
||||||
} else {
|
|
||||||
if (stashedNumberItems.length > 0) {
|
|
||||||
commitStashedNumbers(item);
|
|
||||||
}
|
|
||||||
newLineItems.push(item);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
commitStashedNumbers();
|
|
||||||
|
|
||||||
|
|
||||||
return [newLineItems, new ParsedElements({
|
|
||||||
footnoteLinks: footnoteLinks,
|
|
||||||
footnotes: footnotes
|
|
||||||
})];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Consumes the text items of one line in order and decorates runs of items
 * whose font maps to a format type with `needFormat` set.
 *
 * Items of such a run are stashed until the format changes or the line ends.
 * Depending on where the run sits in the line it is emitted as:
 *  - whole line (starts at the beginning, reaches the end): items unchanged,
 *    the format is reported through `lineFormat`;
 *  - starts at the beginning, ends inside the line: only the last item gets the
 *    end symbol, the format is reported through `unopenedFormat`;
 *  - starts inside the line, reaches the end: only the first item gets the
 *    start symbol, the format is reported through `unclosedFormat`;
 *  - fully inside the line: first item gets the start symbol, last item the end
 *    symbol (a single item gets both) and `inlineFormats` is incremented.
 */
class Formatter {

    /**
     * @param {Map} fontToFormats - maps an item's `font` to a format type
     *        exposing `needFormat`, `startSymbol` and `endSymbol`
     */
    constructor(fontToFormats) {
        this.fontToFormats = fontToFormats;

        this.resultItems = [];
        this.lineFormat = undefined;
        this.unopenedFormat = undefined;
        this.unclosedFormat = undefined;

        this.openFormat = undefined; // format of the run currently being stashed
        this.stashedItems = [];
        this.inlineFormats = 0;
        this.lastItem = undefined;
    }

    /**
     * Feeds the next item of the line into the formatter.
     * @param item - text item; its `font` selects the format type
     */
    consume(item) {
        const formatType = this.fontToFormats.get(item.font);
        if (this.openFormat && formatType !== this.openFormat) {
            // the format changed, so the stashed run ends inside the line
            this.flushStash(false);
        }
        // A font without a registered format type is treated as plain text
        // instead of failing on the property access.
        if (formatType && formatType.needFormat) {
            this.openFormat = formatType;
            this.stashedItems.push(item);
        } else {
            this.resultItems.push(item);
        }
    }

    /**
     * Finishes the line: a still open run is flushed as reaching the line end.
     * @returns {Array} the resulting items in line order
     */
    getResults() {
        if (this.openFormat) {
            this.flushStash(true);
        }
        return this.resultItems;
    }

    /**
     * Emits the stashed run into the results, decorated according to its
     * position in the line.
     * @param {boolean} formatToEndOfLine - true if the run reaches the line end
     */
    flushStash(formatToEndOfLine) {
        // Nothing emitted yet means the run started at the beginning of the line.
        // (Checked by length; comparing the array itself to 0 relies on coercion.)
        const formatFromBeginningOfLine = this.resultItems.length === 0;

        if (formatFromBeginningOfLine && formatToEndOfLine) {
            this.lineFormat = this.openFormat;
            this.moveStashItemsToResult();
            return;
        }

        if (formatFromBeginningOfLine) {
            this.unopenedFormat = this.openFormat;
            // build the closing item before the stash move resets openFormat
            const newLastItem = this.newClosingItem(this.stashedItems.pop());
            this.moveStashItemsToResult();
            this.resultItems.push(newLastItem);
            return;
        }

        if (formatToEndOfLine) {
            this.unclosedFormat = this.openFormat;
            const newFirstItem = this.newOpeningItem(this.stashedItems.shift());
            this.resultItems.push(newFirstItem);
            this.moveStashItemsToResult();
            return;
        }

        this.inlineFormats++;
        if (this.stashedItems.length === 1) {
            const onlyItem = this.stashedItems.pop();
            // a whitespace-only run is not emitted at all
            if (onlyItem.text.trim().length > 0) {
                this.resultItems.push(this.newCompleteItem(onlyItem));
            }
            this.moveStashItemsToResult();
            return;
        }

        const firstItem = this.newOpeningItem(this.stashedItems.shift());
        const lastItem = this.newClosingItem(this.stashedItems.pop());
        this.resultItems.push(firstItem);
        this.moveStashItemsToResult();
        this.resultItems.push(lastItem);
    }

    /** Appends all stashed items to the results and closes the open run. */
    moveStashItemsToResult() {
        this.resultItems.push(...this.stashedItems);
        this.stashedItems = [];
        this.openFormat = null;
    }

    /** @returns a copy of item whose text carries the open format's start symbol */
    newOpeningItem(item) {
        return new TextItem({
            ...item,
            text: prefixAfterWhitespace(this.openFormat.startSymbol, item.text)
        });
    }

    /** @returns a copy of item whose text carries the open format's end symbol */
    newClosingItem(item) {
        return new TextItem({
            ...item,
            text: suffixBeforeWhitespace(item.text, this.openFormat.endSymbol)
        });
    }

    /** @returns a copy of item whose text carries both start and end symbol */
    newCompleteItem(item) {
        return new TextItem({
            ...item,
            text: suffixBeforeWhitespace(prefixAfterWhitespace(this.openFormat.startSymbol, item.text), this.openFormat.endSymbol)
        });
    }

}
|
|
@ -1,5 +1,5 @@
|
|||||||
import TextItem from './TextItem.jsx';
|
import TextItem from './TextItem.jsx';
|
||||||
import { sortByX } from '../textItemFunctions.jsx'
|
import { sortByX } from '../pageItemFunctions.jsx'
|
||||||
|
|
||||||
//Groups all text items which are on the same y line
|
//Groups all text items which are on the same y line
|
||||||
export default class TextItemLineGrouper {
|
export default class TextItemLineGrouper {
|
||||||
|
8
src/javascript/models/Word.jsx
Normal file
8
src/javascript/models/Word.jsx
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
/**
 * A single word of a line together with its (optional) markdown word type.
 */
export default class Word {

    /**
     * @param {Object} options
     * @param {string} options.string - the text of the word
     * @param options.type - the WordType of the word, if any
     */
    constructor(options) {
        const { string, type } = options;
        this.string = string;
        this.type = type; // WordType
    }

}
|
7
src/javascript/models/markdown/WordType.jsx
Normal file
7
src/javascript/models/markdown/WordType.jsx
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
import { Enum } from 'enumify';
|
||||||
|
|
||||||
|
// A Markdown word element: the kind of inline markup a single word can carry.
|
||||||
|
export default class WordType extends Enum {
|
||||||
|
}
|
||||||
|
|
||||||
|
// Declares the available word types; each name is then reachable as a static
// member of the class (e.g. WordType.BOLD).
WordType.initEnum(['LINK', 'FOOTNOTE_LINK', 'FOOTNOTE', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']);
|
@ -1,16 +1,16 @@
|
|||||||
import React from 'react';
|
import React from 'react';
|
||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import ParseResult from '../ParseResult.jsx';
|
import ParseResult from '../ParseResult.jsx';
|
||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
import LineItemBlock from '../LineItemBlock.jsx';
|
||||||
import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx';
|
import LineItemBlockPageView from '../../components/debug/LineItemBlockPageView.jsx';
|
||||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView
|
// Abstract class for transformations producing LineItemBlock(s) to be shown in the LineItemBlockPageView
|
||||||
export default class ToTextItemBlockTransformation extends Transformation {
|
export default class ToLineItemBlockTransformation extends Transformation {
|
||||||
|
|
||||||
constructor(name) {
|
constructor(name) {
|
||||||
super(name, TextItemBlock.name);
|
super(name, LineItemBlock.name);
|
||||||
if (this.constructor === ToTextItemBlockTransformation) {
|
if (this.constructor === ToLineItemBlockTransformation) {
|
||||||
throw new TypeError("Can not construct abstract class.");
|
throw new TypeError("Can not construct abstract class.");
|
||||||
}
|
}
|
||||||
this.showWhitespaces = false;
|
this.showWhitespaces = false;
|
||||||
@ -25,7 +25,7 @@ export default class ToTextItemBlockTransformation extends Transformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
createPageView(page, modificationsOnly) {
|
createPageView(page, modificationsOnly) {
|
||||||
return <TextItemBlockPageView
|
return <LineItemBlockPageView
|
||||||
key={ page.index }
|
key={ page.index }
|
||||||
page={ page }
|
page={ page }
|
||||||
modificationsOnly={ modificationsOnly }
|
modificationsOnly={ modificationsOnly }
|
@ -0,0 +1,46 @@
|
|||||||
|
import React from 'react';
|
||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import ParseResult from '../ParseResult.jsx';
|
||||||
|
import LineItem from '../LineItem.jsx';
|
||||||
|
import LineItemPageView from '../../components/debug/LineItemPageView.jsx';
|
||||||
|
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
|
// Abstract class for transformations producing LineItem(s) to be shown in the LineItemPageView
|
||||||
|
export default class ToLineItemTransformation extends Transformation {
|
||||||
|
|
||||||
|
constructor(name) {
|
||||||
|
super(name, LineItem.name);
|
||||||
|
if (this.constructor === ToLineItemTransformation) {
|
||||||
|
throw new TypeError("Can not construct abstract class.");
|
||||||
|
}
|
||||||
|
this.showWhitespaces = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
showPageSelection() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
showModificationCheckbox() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
createPageView(page, modificationsOnly) {
|
||||||
|
return <LineItemPageView
|
||||||
|
key={ page.index }
|
||||||
|
page={ page }
|
||||||
|
modificationsOnly={ modificationsOnly }
|
||||||
|
showWhitespaces={ this.showWhitespaces } />;
|
||||||
|
}
|
||||||
|
|
||||||
|
completeTransform(parseResult:ParseResult) {
|
||||||
|
// The usual cleanup
|
||||||
|
parseResult.messages = [];
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
|
||||||
|
page.items.forEach(item => item.annotation = null);
|
||||||
|
});
|
||||||
|
return parseResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -1,6 +1,7 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import StringFormat from '../../StringFormat.jsx';
|
import WordType from '../../markdown/WordType.jsx';
|
||||||
|
// import StringFormat from '../../StringFormat.jsx';
|
||||||
|
|
||||||
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||||
|
|
||||||
@ -54,21 +55,21 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
|||||||
this.fontMap.forEach(function(value, key) {
|
this.fontMap.forEach(function(value, key) {
|
||||||
fontIdToName.push(key + " = " + value.name)
|
fontIdToName.push(key + " = " + value.name)
|
||||||
const fontName = value.name.toLowerCase();
|
const fontName = value.name.toLowerCase();
|
||||||
var format;
|
var type;
|
||||||
if (key == mostUsedFont) {
|
if (key == mostUsedFont) {
|
||||||
format = StringFormat.STANDARD;
|
type = null;
|
||||||
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
|
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
|
||||||
format = StringFormat.BOLD_OBLIQUE;
|
type = WordType.BOLD_OBLIQUE;
|
||||||
} else if (fontName.includes('bold')) {
|
} else if (fontName.includes('bold')) {
|
||||||
format = StringFormat.BOLD;
|
type = WordType.BOLD;
|
||||||
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
|
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
|
||||||
format = StringFormat.OBLIQUE;
|
type = WordType.OBLIQUE;
|
||||||
} else if (fontName === maxHeightFont) {
|
} else if (fontName === maxHeightFont) {
|
||||||
format = StringFormat.BOLD;
|
type = WordType.BOLD;
|
||||||
} else {
|
}
|
||||||
format = StringFormat.STANDARD;
|
if (type) {
|
||||||
|
fontToFormats.set(key, type);
|
||||||
}
|
}
|
||||||
fontToFormats.set(key, format);
|
|
||||||
});
|
});
|
||||||
fontIdToName.sort();
|
fontIdToName.sort();
|
||||||
|
|
||||||
|
@ -1,16 +1,16 @@
|
|||||||
import React from 'react';
|
import React from 'react';
|
||||||
|
|
||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import { ParsedElements } from '../../PageItem.jsx';
|
import LineItem from '../../LineItem.jsx';
|
||||||
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
|
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
|
||||||
import TextItemLineCompactor from '../../TextItemLineCompactor.jsx';
|
import LineConverter from '../../LineConverter.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
|
|
||||||
|
|
||||||
// gathers text items on the same y line to one text item
|
// gathers text items on the same y line to one line item
|
||||||
export default class CompactLines extends ToTextItemTransformation {
|
export default class CompactLines extends ToLineItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Compact To Lines");
|
super("Compact To Lines");
|
||||||
@ -20,39 +20,32 @@ export default class CompactLines extends ToTextItemTransformation {
|
|||||||
const {mostUsedDistance, fontToFormats} = parseResult.globals;
|
const {mostUsedDistance, fontToFormats} = parseResult.globals;
|
||||||
const foundFootnotes = [];
|
const foundFootnotes = [];
|
||||||
const foundFootnoteLinks = [];
|
const foundFootnoteLinks = [];
|
||||||
var inlineFormats = 0;
|
var formattedWords = 0;
|
||||||
var lineFormats = 0;
|
|
||||||
var unopenedFormats = 0;
|
|
||||||
var unclosedFormats = 0;
|
|
||||||
|
|
||||||
const lineGrouper = new TextItemLineGrouper({
|
const lineGrouper = new TextItemLineGrouper({
|
||||||
mostUsedDistance: mostUsedDistance,
|
mostUsedDistance: mostUsedDistance,
|
||||||
});
|
});
|
||||||
const lineCompactor = new TextItemLineCompactor(fontToFormats);
|
const lineCompactor = new LineConverter(fontToFormats);
|
||||||
|
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
if (page.items.length > 0) {
|
if (page.items.length > 0) {
|
||||||
const newItems = [];
|
const lineItems = [];
|
||||||
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
||||||
textItemsGroupedByLine.forEach(textItemsOfLine => {
|
textItemsGroupedByLine.forEach(lineTextItems => {
|
||||||
var lineItem;
|
const lineItem = lineCompactor.compact(lineTextItems);
|
||||||
if (textItemsOfLine.length == 1) {
|
if (lineTextItems.length > 1) {
|
||||||
lineItem = textItemsOfLine[0];
|
lineItem.annotation = ADDED_ANNOTATION;
|
||||||
const formatType = fontToFormats.get(lineItem.font);
|
lineTextItems.forEach(item => {
|
||||||
if (formatType.needFormat) {
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
lineItem.lineFormat = formatType;
|
lineItems.push(new LineItem({
|
||||||
lineItem.parsedElements = new ParsedElements({
|
...item
|
||||||
completeLineFormats: 1
|
}));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else {
|
if (lineItem.words.length == 0) {
|
||||||
textItemsOfLine.forEach(item => {
|
lineItem.annotation = REMOVED_ANNOTATION;
|
||||||
item.annotation = REMOVED_ANNOTATION;
|
}
|
||||||
newItems.push(item);
|
lineItems.push(lineItem);
|
||||||
});
|
|
||||||
|
|
||||||
lineItem = lineCompactor.compact(textItemsOfLine);
|
|
||||||
lineItem.annotation = ADDED_ANNOTATION;
|
|
||||||
|
|
||||||
if (lineItem.parsedElements.footnoteLinks.length > 0) {
|
if (lineItem.parsedElements.footnoteLinks.length > 0) {
|
||||||
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
||||||
@ -63,15 +56,8 @@ export default class CompactLines extends ToTextItemTransformation {
|
|||||||
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||||
}
|
}
|
||||||
inlineFormats += lineItem.parsedElements.inlineFormats;
|
|
||||||
}
|
|
||||||
if (lineItem.lineFormat) lineFormats++;
|
|
||||||
if (lineItem.unopenedFormat) unopenedFormats++;
|
|
||||||
if (lineItem.unclosedFormat) unclosedFormats++;
|
|
||||||
lineItem.text = lineItem.text.trim();
|
|
||||||
newItems.push(lineItem);
|
|
||||||
});
|
});
|
||||||
page.items = newItems;
|
page.items = lineItems;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -79,11 +65,8 @@ export default class CompactLines extends ToTextItemTransformation {
|
|||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
messages: [
|
messages: [
|
||||||
'Detected ' + lineFormats + ' line formats',
|
'Detected ' + formattedWords + ' formatted words',
|
||||||
'Detected ' + inlineFormats + ' inline formats',
|
<span>Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }]</span>,
|
||||||
'Detected ' + unclosedFormats + ' opened un-closed formats',
|
|
||||||
'Detected ' + unopenedFormats + ' un-opened closed formats',
|
|
||||||
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
|
|
||||||
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
|
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
|
||||||
]
|
]
|
||||||
});
|
});
|
||||||
|
@ -6,6 +6,8 @@ import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../.
|
|||||||
//Complete unopened/unclosed bold/italic formats
|
//Complete unopened/unclosed bold/italic formats
|
||||||
export default class CompleteFormats extends ToTextItemTransformation {
|
export default class CompleteFormats extends ToTextItemTransformation {
|
||||||
|
|
||||||
|
//TODO move to block and ignore quotes
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Complete Bold/Italics");
|
super("Complete Bold/Italics");
|
||||||
}
|
}
|
||||||
@ -81,7 +83,6 @@ class ItemStack {
|
|||||||
}
|
}
|
||||||
|
|
||||||
consume(item) {
|
consume(item) {
|
||||||
const te = item.text;
|
|
||||||
var newItem;
|
var newItem;
|
||||||
|
|
||||||
const handleFreshUnopened = () => {
|
const handleFreshUnopened = () => {
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
@ -6,7 +6,7 @@ import { headlineByLevel } from '../../ElementType.jsx';
|
|||||||
import { isListItem } from '../../../functions.jsx';
|
import { isListItem } from '../../../functions.jsx';
|
||||||
|
|
||||||
//Detect items starting with -, •, etc...
|
//Detect items starting with -, •, etc...
|
||||||
export default class DetectHeaders extends ToTextItemTransformation {
|
export default class DetectHeaders extends ToLineItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect Headers");
|
super("Detect Headers");
|
||||||
@ -21,15 +21,15 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
|||||||
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
||||||
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
||||||
pagesWithMaxHeight.forEach(titlePage => {
|
pagesWithMaxHeight.forEach(titlePage => {
|
||||||
titlePage.items.forEach(textItem => {
|
titlePage.items.forEach(item => {
|
||||||
const height = textItem.height;
|
const height = item.height;
|
||||||
if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||||
if (height == maxHeight) {
|
if (height == maxHeight) {
|
||||||
textItem.type = ElementType.H1;
|
item.type = ElementType.H1;
|
||||||
} else {
|
} else {
|
||||||
textItem.type = ElementType.H2;
|
item.type = ElementType.H2;
|
||||||
}
|
}
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
detectedHeaders++;
|
detectedHeaders++;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -41,10 +41,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
|||||||
var range = headlineTypeToHeightRange[headlineType];
|
var range = headlineTypeToHeightRange[headlineType];
|
||||||
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(item => {
|
||||||
if (!textItem.type && textItem.height == range.max) {
|
if (!item.type && item.height == range.max) {
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
textItem.type = ElementType.enumValueOf(headlineType);
|
item.type = ElementType.enumValueOf(headlineType);
|
||||||
detectedHeaders++
|
detectedHeaders++
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -56,10 +56,10 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
|||||||
const heights = [];
|
const heights = [];
|
||||||
var lastHeight;
|
var lastHeight;
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(item => {
|
||||||
if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) {
|
if (!item.type && item.height > mostUsedHeight && !isListItem(item.text())) {
|
||||||
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
|
if (!heights.includes(item.height) && (!lastHeight || lastHeight > item.height)) {
|
||||||
heights.push(textItem.height);
|
heights.push(item.height);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -69,11 +69,11 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
|||||||
heights.forEach((height, i) => {
|
heights.forEach((height, i) => {
|
||||||
const headlineType = headlineByLevel(2 + i);
|
const headlineType = headlineByLevel(2 + i);
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(item => {
|
||||||
if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) {
|
if (!item.type && item.height == height && !isListItem(item.text())) {
|
||||||
detectedHeaders++;
|
detectedHeaders++;
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
textItem.type = headlineType;
|
item.type = headlineType;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -83,9 +83,9 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
|||||||
//find headlines which have paragraph height
|
//find headlines which have paragraph height
|
||||||
var smallesHeadlineLevel = 1;
|
var smallesHeadlineLevel = 1;
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(item => {
|
||||||
if (textItem.type && textItem.type.headline) {
|
if (item.type && item.type.headline) {
|
||||||
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, textItem.type.headlineLevel);
|
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, item.type.headlineLevel);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -93,18 +93,18 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
|||||||
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
|
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
var lastItem;
|
var lastItem;
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(item => {
|
||||||
if (!textItem.type
|
if (!item.type
|
||||||
&& textItem.height == mostUsedHeight
|
&& item.height == mostUsedHeight
|
||||||
&& textItem.font !== mostUsedFont
|
&& item.font !== mostUsedFont
|
||||||
&& (!lastItem || lastItem.y < textItem.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - textItem.y > mostUsedDistance * 2))
|
&& (!lastItem || lastItem.y < item.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - item.y > mostUsedDistance * 2))
|
||||||
&& textItem.text === textItem.text.toUpperCase()
|
&& item.text() === item.text().toUpperCase()
|
||||||
) {
|
) {
|
||||||
detectedHeaders++;
|
detectedHeaders++;
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
textItem.type = nextHeadlineType;
|
item.type = nextHeadlineType;
|
||||||
}
|
}
|
||||||
lastItem = textItem;
|
lastItem = item;
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -124,8 +124,8 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
|||||||
function findPagesWithMaxHeight(pages, maxHeight) {
|
function findPagesWithMaxHeight(pages, maxHeight) {
|
||||||
const maxHeaderPagesSet = new Set();
|
const maxHeaderPagesSet = new Set();
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(item => {
|
||||||
if (!textItem.type && textItem.height == maxHeight) {
|
if (!item.type && item.height == maxHeight) {
|
||||||
maxHeaderPagesSet.add(page);
|
maxHeaderPagesSet.add(page);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import TextItem from '../../TextItem.jsx';
|
import LineItem from '../../LineItem.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
||||||
|
|
||||||
//Detect items starting with -, •, etc...
|
//Detect items starting with -, •, etc...
|
||||||
export default class DetectListItems extends ToTextItemTransformation {
|
export default class DetectListItems extends ToLineItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect List Items");
|
super("Detect List Items");
|
||||||
@ -16,34 +16,34 @@ export default class DetectListItems extends ToTextItemTransformation {
|
|||||||
var foundListItems = 0;
|
var foundListItems = 0;
|
||||||
var foundNumberedItems = 0;
|
var foundNumberedItems = 0;
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
const newTextItems = [];
|
const newItems = [];
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(item => {
|
||||||
newTextItems.push(textItem);
|
newItems.push(item);
|
||||||
if (!textItem.type) {
|
if (!item.type) {
|
||||||
var text = textItem.text;
|
var text = item.text();
|
||||||
if (isListItem(text)) {
|
if (isListItem(text)) {
|
||||||
foundListItems++
|
foundListItems++
|
||||||
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
|
||||||
if (textWithDash === text) {
|
if (textWithDash === text) {
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
textItem.type = ElementType.LIST;
|
item.type = ElementType.LIST;
|
||||||
} else {
|
} else {
|
||||||
textItem.annotation = REMOVED_ANNOTATION;
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
newTextItems.push(new TextItem({
|
newItems.push(new LineItem({
|
||||||
...textItem,
|
...item,
|
||||||
text: textWithDash,
|
text: textWithDash,
|
||||||
annotation: ADDED_ANNOTATION,
|
annotation: ADDED_ANNOTATION,
|
||||||
type: ElementType.LIST
|
type: ElementType.LIST
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
} else if (isNumberedListItem(text)) {
|
} else if (isNumberedListItem(text)) { //TODO check that starts with 1 (kala chakra)
|
||||||
foundNumberedItems++;
|
foundNumberedItems++;
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
textItem.type = ElementType.LIST;
|
item.type = ElementType.LIST;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
page.items = newTextItems;
|
page.items = newItems;
|
||||||
});
|
});
|
||||||
|
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
|
@ -1,14 +1,15 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import TextItem from '../../TextItem.jsx';
|
import LineItem from '../../LineItem.jsx';
|
||||||
|
import Word from '../../Word.jsx';
|
||||||
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { headlineByLevel } from '../../ElementType.jsx';
|
import { headlineByLevel } from '../../ElementType.jsx';
|
||||||
import { isDigit, wordMatch } from '../../../functions.jsx'
|
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx'
|
||||||
|
|
||||||
//Detect table of contents pages
|
//Detect table of contents pages plus linked headlines
|
||||||
export default class DetectTOC extends ToTextItemTransformation {
|
export default class DetectTOC extends ToLineItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect TOC");
|
super("Detect TOC");
|
||||||
@ -17,64 +18,68 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const tocPages = [];
|
const tocPages = [];
|
||||||
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
||||||
|
|
||||||
const linkLeveler = new LinkLeveler();
|
const linkLeveler = new LinkLeveler();
|
||||||
|
|
||||||
|
|
||||||
var tocLinks = [];
|
var tocLinks = [];
|
||||||
var lastTocPage;
|
var lastTocPage;
|
||||||
var headlineItem;
|
var headlineItem;
|
||||||
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
||||||
const lineItemsWithDigits = [];
|
var lineItemsWithDigits = 0;
|
||||||
const unknownLines = new Set();
|
const unknownLines = new Set();
|
||||||
const pageTocLinks = [];
|
const pageTocLinks = [];
|
||||||
var lastLineTextWithoutNumber;
|
var lastWordsWithoutNumber;
|
||||||
var lastLine;
|
var lastLine;
|
||||||
|
//find lines ending with a number per page
|
||||||
page.items.forEach(line => {
|
page.items.forEach(line => {
|
||||||
var lineText = line.text.replace(/\./g, '').trim();
|
var words = line.words.filter(word => !hasOnly(word.string, '.'));
|
||||||
var endsWithDigit = false;
|
const digits = [];
|
||||||
var digits = [];
|
while (words.length > 0 && isNumber(words[words.length - 1].string)) {
|
||||||
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
|
const lastWord = words.pop();
|
||||||
digits.unshift(lineText.charAt(lineText.length - 1));
|
digits.unshift(lastWord.string);
|
||||||
lineText = lineText.substring(0, lineText.length - 1);
|
|
||||||
endsWithDigit = true;
|
|
||||||
}
|
}
|
||||||
lineText = lineText.trim();
|
|
||||||
|
if (digits.length == 0 && words.length > 0) {
|
||||||
|
const lastWord = words[words.length - 1];
|
||||||
|
while (isDigit(lastWord.string.charCodeAt(lastWord.string.length - 1))) {
|
||||||
|
digits.unshift(lastWord.string.charAt(lastWord.string.length - 1))
|
||||||
|
lastWord.string = lastWord.string.substring(0, lastWord.string.length - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var endsWithDigit = digits.length > 0;
|
||||||
if (endsWithDigit) {
|
if (endsWithDigit) {
|
||||||
endsWithDigit = true;
|
endsWithDigit = true;
|
||||||
if (lastLineTextWithoutNumber) { // 2-line item ?
|
if (lastWordsWithoutNumber) { // 2-line item ?
|
||||||
lineText = lastLineTextWithoutNumber + ' ' + lineText;
|
words.push(...lastWordsWithoutNumber);
|
||||||
lastLineTextWithoutNumber = null;
|
lastWordsWithoutNumber = null;
|
||||||
}
|
}
|
||||||
pageTocLinks.push(new TocLink({
|
pageTocLinks.push(new TocLink({
|
||||||
pageNumber: parseInt(digits.join('')),
|
pageNumber: parseInt(digits.join('')),
|
||||||
textItem: new TextItem({
|
lineItem: new LineItem({
|
||||||
...line,
|
...line,
|
||||||
text: lineText
|
words: words
|
||||||
})
|
})
|
||||||
}));
|
}));
|
||||||
lineItemsWithDigits.push(new TextItem({
|
lineItemsWithDigits++;
|
||||||
...line,
|
|
||||||
text: lineText
|
|
||||||
}));
|
|
||||||
lastLineTextWithoutNumber = null;
|
|
||||||
} else {
|
} else {
|
||||||
if (!headlineItem) {
|
if (!headlineItem) {
|
||||||
headlineItem = line;
|
headlineItem = line;
|
||||||
} else {
|
} else {
|
||||||
if (lastLineTextWithoutNumber) {
|
if (lastWordsWithoutNumber) {
|
||||||
unknownLines.add(lastLine);
|
unknownLines.add(lastLine);
|
||||||
}
|
}
|
||||||
lastLineTextWithoutNumber = lineText;
|
lastWordsWithoutNumber = words;
|
||||||
lastLine = line;
|
lastLine = line;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// page has been processed
|
// page has been processed
|
||||||
if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
|
if (lineItemsWithDigits * 100 / page.items.length > 75) {
|
||||||
tocPages.push(page.index + 1);
|
tocPages.push(page.index + 1);
|
||||||
lastTocPage = page;
|
lastTocPage = page;
|
||||||
linkLeveler.levelPageItems(pageTocLinks);
|
linkLeveler.levelPageItems(pageTocLinks);
|
||||||
tocLinks = tocLinks.concat(pageTocLinks);
|
tocLinks.push(...pageTocLinks);
|
||||||
|
|
||||||
const newBlocks = [];
|
const newBlocks = [];
|
||||||
page.items.forEach((line) => {
|
page.items.forEach((line) => {
|
||||||
@ -83,7 +88,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
}
|
}
|
||||||
newBlocks.push(line);
|
newBlocks.push(line);
|
||||||
if (line === headlineItem) {
|
if (line === headlineItem) {
|
||||||
newBlocks.push(new TextItem({
|
newBlocks.push(new LineItem({
|
||||||
...line,
|
...line,
|
||||||
type: ElementType.H2,
|
type: ElementType.H2,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
@ -105,8 +110,10 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
if (tocPages.length > 0) {
|
if (tocPages.length > 0) {
|
||||||
// Add TOC items
|
// Add TOC items
|
||||||
tocLinks.forEach(tocLink => {
|
tocLinks.forEach(tocLink => {
|
||||||
lastTocPage.items.push(new TextItem({
|
lastTocPage.items.push(new LineItem({
|
||||||
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
words: [new Word({
|
||||||
|
string: ' '.repeat(tocLink.level * 3) + '-'
|
||||||
|
})].concat(tocLink.lineItem.words),
|
||||||
type: ElementType.TOC,
|
type: ElementType.TOC,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
@ -118,11 +125,11 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
|
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
|
||||||
var foundHealineItems;
|
var foundHealineItems;
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
|
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
|
||||||
if (!foundHealineItems) { // pages are off by 1 ?
|
if (!foundHealineItems) { // pages are off by 1 ?
|
||||||
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
|
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
|
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -142,11 +149,16 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
const headlineType = headlineByLevel(notFoundTocLink.level + 2);
|
||||||
const heightRange = headlineTypeToHeightRange[headlineType.name];
|
const heightRange = headlineTypeToHeightRange[headlineType.name];
|
||||||
if (heightRange) {
|
if (heightRange) {
|
||||||
const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
|
||||||
if (textItem) {
|
if (lineIndex > -1) {
|
||||||
textItem.type = headlineType;
|
const page = parseResult.pages[pageIndex];
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
page.items[lineIndex].annotation = REMOVED_ANNOTATION;
|
||||||
foundBySize.push(textItem.text);
|
page.items.splice(lineIndex + 1, 0, new LineItem({
|
||||||
|
...notFoundTocLink.lineItem,
|
||||||
|
type: headlineType,
|
||||||
|
annotation: ADDED_ANNOTATION,
|
||||||
|
}));
|
||||||
|
foundBySize.push(notFoundTocLink);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -173,12 +185,12 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
const messages = [];
|
const messages = [];
|
||||||
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
messages.push('Detected ' + tocPages.length + ' table of content pages');
|
||||||
if (tocPages.length > 0) {
|
if (tocPages.length > 0) {
|
||||||
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
|
|
||||||
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
|
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
|
||||||
|
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
|
||||||
}
|
}
|
||||||
if (notFoundHeadlines.length > 0) {
|
if (notFoundHeadlines.length > 0) {
|
||||||
messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
|
messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text()));
|
||||||
messages.push('Found TOC headlines (by size): ' + foundBySize);
|
messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber));
|
||||||
}
|
}
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
@ -196,7 +208,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
|||||||
//Find out how the TOC page link actually translates to the page.index
|
//Find out how the TOC page link actually translates to the page.index
|
||||||
function detectPageMappingNumber(pages, tocLinks) {
|
function detectPageMappingNumber(pages, tocLinks) {
|
||||||
for ( var tocLink of tocLinks ) {
|
for ( var tocLink of tocLinks ) {
|
||||||
const page = findPageWithHeadline(pages, tocLink.textItem.text);
|
const page = findPageWithHeadline(pages, tocLink.lineItem.text());
|
||||||
if (page) {
|
if (page) {
|
||||||
return page.index - tocLink.pageNumber;
|
return page.index - tocLink.pageNumber;
|
||||||
}
|
}
|
||||||
@ -235,9 +247,9 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
|||||||
foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
||||||
const headlineType = headlineByLevel(tocLink.level + 2);
|
const headlineType = headlineByLevel(tocLink.level + 2);
|
||||||
const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
|
const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
|
||||||
page.items.splice(foundItems.lineIndex + 1, 0, new TextItem({
|
page.items.splice(foundItems.lineIndex + 1, 0, new LineItem({
|
||||||
...foundItems.headlineItems[0],
|
...foundItems.headlineItems[0],
|
||||||
text: tocLink.textItem.text,
|
words: tocLink.lineItem.words,
|
||||||
height: headlineHeight,
|
height: headlineHeight,
|
||||||
type: headlineType,
|
type: headlineType,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
@ -255,21 +267,22 @@ function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
|
function findPageAndLineFromHeadline(pages, tocLink, heightRange, fromPage, toPage) {
|
||||||
|
const linkText = tocLink.lineItem.text().toUpperCase();
|
||||||
for (var i = fromPage; i <= toPage; i++) {
|
for (var i = fromPage; i <= toPage; i++) {
|
||||||
const page = pages[i - 1];
|
const page = pages[i - 1];
|
||||||
for ( var line of page.items ) {
|
const lineIndex = page.items.findIndex(line => {
|
||||||
if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
|
if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
|
||||||
const match = wordMatch(tocLink.textItem.text, line.text);
|
const match = wordMatch(linkText, line.text());
|
||||||
if (match >= 0.5) {
|
return match >= 0.5;
|
||||||
return line;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
if (lineIndex > -1) return [i - 1, lineIndex];
|
||||||
}
|
}
|
||||||
|
return [-1, -1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class LinkLeveler {
|
class LinkLeveler {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.levelByMethod = null;
|
this.levelByMethod = null;
|
||||||
@ -297,13 +310,13 @@ class LinkLeveler {
|
|||||||
levelByXDiff(tocLinks) {
|
levelByXDiff(tocLinks) {
|
||||||
const uniqueX = this.calculateUniqueX(tocLinks);
|
const uniqueX = this.calculateUniqueX(tocLinks);
|
||||||
tocLinks.forEach(link => {
|
tocLinks.forEach(link => {
|
||||||
link.level = uniqueX.indexOf(link.textItem.x);
|
link.level = uniqueX.indexOf(link.lineItem.x);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
levelByFont(tocLinks) {
|
levelByFont(tocLinks) {
|
||||||
tocLinks.forEach(link => {
|
tocLinks.forEach(link => {
|
||||||
link.level = this.uniqueFonts.indexOf(link.textItem.font);
|
link.level = this.uniqueFonts.indexOf(link.lineItem.font);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -315,7 +328,7 @@ class LinkLeveler {
|
|||||||
|
|
||||||
calculateUniqueX(tocLinks) {
|
calculateUniqueX(tocLinks) {
|
||||||
var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
|
var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
|
||||||
if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
|
if (uniquesArray.indexOf(link.lineItem.x) < 0) uniquesArray.push(link.lineItem.x);
|
||||||
return uniquesArray;
|
return uniquesArray;
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
@ -328,7 +341,7 @@ class LinkLeveler {
|
|||||||
|
|
||||||
calculateUniqueFonts(tocLinks) {
|
calculateUniqueFonts(tocLinks) {
|
||||||
var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
|
var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
|
||||||
if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
|
if (uniquesArray.indexOf(link.lineItem.font) < 0) uniquesArray.push(link.lineItem.font);
|
||||||
return uniquesArray;
|
return uniquesArray;
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
@ -339,7 +352,7 @@ class LinkLeveler {
|
|||||||
|
|
||||||
class TocLink {
|
class TocLink {
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
this.textItem = options.textItem;
|
this.lineItem = options.lineItem;
|
||||||
this.pageNumber = options.pageNumber;
|
this.pageNumber = options.pageNumber;
|
||||||
this.level = 0;
|
this.level = 0;
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
|
|
||||||
@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) {
|
|||||||
|
|
||||||
|
|
||||||
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
||||||
export default class RemoveRepetitiveElements extends ToTextItemTransformation {
|
export default class RemoveRepetitiveElements extends ToLineItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Remove Repetitive Elements");
|
super("Remove Repetitive Elements");
|
||||||
@ -58,8 +58,8 @@ export default class RemoveRepetitiveElements extends ToTextItemTransformation {
|
|||||||
maxElements: []
|
maxElements: []
|
||||||
});
|
});
|
||||||
|
|
||||||
const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
|
const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
|
||||||
const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
|
const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text().toUpperCase(), ''));
|
||||||
pageStore.push({
|
pageStore.push({
|
||||||
minElements: minMaxItems.minElements,
|
minElements: minMaxItems.minElements,
|
||||||
maxElements: minMaxItems.maxElements,
|
maxElements: minMaxItems.maxElements,
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import TextItem from '../../TextItem.jsx';
|
import LineItem from '../../LineItem.jsx';
|
||||||
|
import StashingStream from '../../StashingStream.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
|
|
||||||
// Converts vertical text to horizontal
|
// Converts vertical text to horizontal
|
||||||
export default class VerticalToHorizontal extends ToTextItemTransformation {
|
export default class VerticalToHorizontal extends ToLineItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Vertical to Horizontal Text");
|
super("Vertical to Horizontal Text");
|
||||||
@ -12,87 +13,64 @@ export default class VerticalToHorizontal extends ToTextItemTransformation {
|
|||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
var foundVerticals = 0;
|
var foundVerticals = 0;
|
||||||
const newPages = parseResult.pages.map(page => {
|
parseResult.pages.forEach(page => {
|
||||||
const newTextItems = [];
|
const stream = new VerticalsStream();
|
||||||
// var oneCharacterItems = [];
|
stream.consumeAll(page.items);
|
||||||
|
page.items = stream.complete();
|
||||||
// const applyTransformation = () => {
|
foundVerticals += stream.foundVerticals;
|
||||||
// oneCharacterItems.forEach(item => {
|
|
||||||
// item.annotation = REMOVED_ANNOTATION;
|
|
||||||
// newTextItems.push(item);
|
|
||||||
// //TODO add new
|
|
||||||
// });
|
|
||||||
// oneCharacterItems = [];
|
|
||||||
// };
|
|
||||||
// const rollbackTransformation = () => {
|
|
||||||
// oneCharacterItems.forEach(item => {
|
|
||||||
// newTextItems.push(item);
|
|
||||||
// });
|
|
||||||
// oneCharacterItems = [];
|
|
||||||
// };
|
|
||||||
|
|
||||||
//TODO generic state machine code ?
|
|
||||||
|
|
||||||
const leftOver = page.items.reduce((oneCharacterItems, item) => {
|
|
||||||
if (item.text.trim().length == 1) {
|
|
||||||
if (oneCharacterItems.length == 0) {
|
|
||||||
oneCharacterItems.push(item);
|
|
||||||
} else {
|
|
||||||
const lastItem = oneCharacterItems[oneCharacterItems.length - 1];
|
|
||||||
if (lastItem.y - item.y > 5 && lastItem.font === item.font) {
|
|
||||||
oneCharacterItems.push(item);
|
|
||||||
} else {
|
|
||||||
if (oneCharacterItems.length > 5) {
|
|
||||||
var combinedText = '';
|
|
||||||
var minX = 999;
|
|
||||||
var maxY = 0;
|
|
||||||
var sumWidth = 0;
|
|
||||||
var maxHeight = 0;
|
|
||||||
oneCharacterItems.forEach(oneCharacterItem => {
|
|
||||||
oneCharacterItem.annotation = REMOVED_ANNOTATION;
|
|
||||||
newTextItems.push(oneCharacterItem);
|
|
||||||
combinedText += oneCharacterItem.text.trim();
|
|
||||||
minX = Math.min(minX, oneCharacterItem.x);
|
|
||||||
maxY = Math.max(maxY, oneCharacterItem.y);
|
|
||||||
sumWidth += oneCharacterItem.width;
|
|
||||||
maxHeight = Math.max(maxHeight, oneCharacterItem.height);
|
|
||||||
});
|
});
|
||||||
newTextItems.push(new TextItem({
|
|
||||||
...oneCharacterItems[0],
|
|
||||||
x: minX,
|
|
||||||
y: maxY,
|
|
||||||
width: sumWidth,
|
|
||||||
height: maxHeight,
|
|
||||||
text: combinedText,
|
|
||||||
annotation: ADDED_ANNOTATION
|
|
||||||
}));
|
|
||||||
foundVerticals++;
|
|
||||||
} else {
|
|
||||||
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
|
||||||
}
|
|
||||||
oneCharacterItems = [item];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
|
||||||
oneCharacterItems = [];
|
|
||||||
newTextItems.push(item);
|
|
||||||
}
|
|
||||||
return oneCharacterItems;
|
|
||||||
}, []);
|
|
||||||
leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
|
||||||
|
|
||||||
return {
|
|
||||||
...page,
|
|
||||||
items: newTextItems
|
|
||||||
};
|
|
||||||
});
|
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
pages: newPages,
|
|
||||||
messages: ["Converted " + foundVerticals + " verticals"]
|
messages: ["Converted " + foundVerticals + " verticals"]
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
class VerticalsStream extends StashingStream {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super();
|
||||||
|
this.foundVerticals = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
shouldStash(item) {
|
||||||
|
return item.words.length == 1 && item.words[0].string.length == 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
doMatchesStash(lastItem, item) {
|
||||||
|
return lastItem.y - item.y > 5 && lastItem.words[0].type === item.words[0].type;
|
||||||
|
}
|
||||||
|
|
||||||
|
doFlushStash(stash, results) {
|
||||||
|
if (stash.length > 5) { // unite
|
||||||
|
var combinedWords = [];
|
||||||
|
var minX = 999;
|
||||||
|
var maxY = 0;
|
||||||
|
var sumWidth = 0;
|
||||||
|
var maxHeight = 0;
|
||||||
|
stash.forEach(oneCharacterLine => {
|
||||||
|
oneCharacterLine.annotation = REMOVED_ANNOTATION;
|
||||||
|
results.push(oneCharacterLine);
|
||||||
|
combinedWords.push(oneCharacterLine.words[0]);
|
||||||
|
minX = Math.min(minX, oneCharacterLine.x);
|
||||||
|
maxY = Math.max(maxY, oneCharacterLine.y);
|
||||||
|
sumWidth += oneCharacterLine.width;
|
||||||
|
maxHeight = Math.max(maxHeight, oneCharacterLine.height);
|
||||||
|
});
|
||||||
|
results.push(new LineItem({
|
||||||
|
...stash[0],
|
||||||
|
x: minX,
|
||||||
|
y: maxY,
|
||||||
|
width: sumWidth,
|
||||||
|
height: maxHeight,
|
||||||
|
words: combinedWords,
|
||||||
|
annotation: ADDED_ANNOTATION
|
||||||
|
}));
|
||||||
|
this.foundVerticals++;
|
||||||
|
} else { //add as singles
|
||||||
|
results.push(...stash);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx';
|
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { minXFromBlocks } from '../../../textItemFunctions.jsx';
|
import { minXFromBlocks } from '../../../pageItemFunctions.jsx';
|
||||||
|
|
||||||
//Detect items which are code/quote blocks
|
//Detect items which are code/quote blocks
|
||||||
export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation {
|
export default class DetectCodeQuoteBlocks extends ToLineItemBlockTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect Code/Quote Blocks");
|
super("Detect Code/Quote Blocks");
|
||||||
@ -17,7 +17,7 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
|
|||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
var minX = minXFromBlocks(page.items);
|
var minX = minXFromBlocks(page.items);
|
||||||
page.items.forEach(block => {
|
page.items.forEach(block => {
|
||||||
if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) {
|
if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) {
|
||||||
block.annotation = DETECTED_ANNOTATION;
|
block.annotation = DETECTED_ANNOTATION;
|
||||||
block.type = ElementType.CODE;
|
block.type = ElementType.CODE;
|
||||||
foundCodeItems++;
|
foundCodeItems++;
|
||||||
@ -36,14 +36,14 @@ export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function looksLikeCodeBlock(minX, textItems, mostUsedHeight) {
|
function looksLikeCodeBlock(minX, items, mostUsedHeight) {
|
||||||
if (textItems.length == 0) {
|
if (items.length == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (textItems.length == 1) {
|
if (items.length == 1) {
|
||||||
return textItems[0].x > minX && textItems[0].height <= mostUsedHeight + 1;
|
return items[0].x > minX && items[0].height <= mostUsedHeight + 1;
|
||||||
}
|
}
|
||||||
for ( var item of textItems ) {
|
for ( var item of items ) {
|
||||||
if (item.x == minX) {
|
if (item.x == minX) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
import ToTextItemBlockTransformation from '..//ToTextItemBlockTransformation.jsx';
|
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
|
import Word from '../../Word.jsx';
|
||||||
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
|
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
|
|
||||||
// Takes care of proper sub-item spacing/leveling
|
// Takes care of proper sub-item spacing/leveling
|
||||||
export default class DetectListLevels extends ToTextItemBlockTransformation {
|
export default class DetectListLevels extends ToLineItemBlockTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Level Lists");
|
super("Level Lists");
|
||||||
@ -21,23 +22,25 @@ export default class DetectListLevels extends ToTextItemBlockTransformation {
|
|||||||
var currentLevel = 0;
|
var currentLevel = 0;
|
||||||
const xByLevel = {};
|
const xByLevel = {};
|
||||||
var modifiedBlock = false;
|
var modifiedBlock = false;
|
||||||
listBlock.textItems.forEach(textItem => {
|
listBlock.items.forEach(item => {
|
||||||
const isListItem = true;
|
const isListItem = true;
|
||||||
if (lastItemX && isListItem) {
|
if (lastItemX && isListItem) {
|
||||||
if (textItem.x > lastItemX) {
|
if (item.x > lastItemX) {
|
||||||
currentLevel++;
|
currentLevel++;
|
||||||
xByLevel[textItem.x] = currentLevel;
|
xByLevel[item.x] = currentLevel;
|
||||||
} else if (textItem.x < lastItemX) {
|
} else if (item.x < lastItemX) {
|
||||||
currentLevel = xByLevel[textItem.x];
|
currentLevel = xByLevel[item.x];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
xByLevel[textItem.x] = 0;
|
xByLevel[item.x] = 0;
|
||||||
}
|
}
|
||||||
if (currentLevel > 0) {
|
if (currentLevel > 0) {
|
||||||
textItem.text = ' '.repeat(currentLevel * 3) + textItem.text;
|
item.words = [new Word({
|
||||||
|
string: ' '.repeat(currentLevel * 3)
|
||||||
|
})].concat(item.words);
|
||||||
modifiedBlock = true;
|
modifiedBlock = true;
|
||||||
}
|
}
|
||||||
lastItemX = textItem.x;
|
lastItemX = item.x;
|
||||||
});
|
});
|
||||||
listBlocks++;
|
listBlocks++;
|
||||||
if (modifiedBlock) {
|
if (modifiedBlock) {
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
import ToTextItemBlockTransformation from '../ToTextItemBlockTransformation.jsx';
|
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import TextItemBlock from '../../TextItemBlock.jsx';
|
import LineItemBlock from '../../LineItemBlock.jsx';
|
||||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import { minXFromTextItems } from '../../../textItemFunctions.jsx';
|
import { minXFromPageItems } from '../../../pageItemFunctions.jsx';
|
||||||
|
|
||||||
// Gathers lines to blocks
|
// Gathers lines to blocks
|
||||||
export default class GatherBlocks extends ToTextItemBlockTransformation {
|
export default class GatherBlocks extends ToLineItemBlockTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Gather Blocks");
|
super("Gather Blocks");
|
||||||
@ -14,29 +14,29 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
|
|||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const {mostUsedDistance} = parseResult.globals;
|
const {mostUsedDistance} = parseResult.globals;
|
||||||
var createdBlocks = 0;
|
var createdBlocks = 0;
|
||||||
var textItems = 0;
|
var lineItemCount = 0;
|
||||||
parseResult.pages.map(page => {
|
parseResult.pages.map(page => {
|
||||||
textItems += page.items.length;
|
lineItemCount += page.items.length;
|
||||||
const blocks = [];
|
const blocks = [];
|
||||||
var stashedBlock = new TextItemBlock({});
|
var stashedBlock = new LineItemBlock({});
|
||||||
const flushStashedItems = () => {
|
const flushStashedItems = () => {
|
||||||
if (stashedBlock.textItems.length > 1) {
|
if (stashedBlock.items.length > 1) {
|
||||||
stashedBlock.annotation = DETECTED_ANNOTATION;
|
stashedBlock.annotation = DETECTED_ANNOTATION;
|
||||||
}
|
}
|
||||||
|
|
||||||
blocks.push(stashedBlock);
|
blocks.push(stashedBlock);
|
||||||
stashedBlock = new TextItemBlock({});
|
stashedBlock = new LineItemBlock({});
|
||||||
createdBlocks++;
|
createdBlocks++;
|
||||||
};
|
};
|
||||||
|
|
||||||
var minX = minXFromTextItems(page.items);
|
var minX = minXFromPageItems(page.items);
|
||||||
page.items.forEach(item => {
|
page.items.forEach(item => {
|
||||||
if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
|
if (stashedBlock.items.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
|
||||||
flushStashedItems();
|
flushStashedItems();
|
||||||
}
|
}
|
||||||
stashedBlock.addTextItem(item);
|
stashedBlock.addItem(item);
|
||||||
});
|
});
|
||||||
if (stashedBlock.textItems.length > 0) {
|
if (stashedBlock.items.length > 0) {
|
||||||
flushStashedItems();
|
flushStashedItems();
|
||||||
}
|
}
|
||||||
page.items = blocks;
|
page.items = blocks;
|
||||||
@ -44,7 +44,7 @@ export default class GatherBlocks extends ToTextItemBlockTransformation {
|
|||||||
|
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items']
|
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + lineItemCount + ' line items']
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,7 +54,7 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
|
|||||||
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
|
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
|
const lastItem = stashedBlock.items[stashedBlock.items.length - 1];
|
||||||
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
|
const hasBigDistance = bigDistance(lastItem, item, minX, mostUsedDistance);
|
||||||
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
|
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !hasBigDistance) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
import TextItemBlock from './models/TextItemBlock.jsx';
|
import PageItem from './models/PageItem.jsx';
|
||||||
import TextItem from './models/TextItem.jsx';
|
import LineItemBlock from './models/LineItemBlock.jsx';
|
||||||
|
|
||||||
export function minXFromBlocks(blocks:TextItemBlock[]) {
|
export function minXFromBlocks(blocks:LineItemBlock[]) {
|
||||||
var minX = 999;
|
var minX = 999;
|
||||||
blocks.forEach(block => {
|
blocks.forEach(block => {
|
||||||
block.textItems.forEach(item => {
|
block.items.forEach(item => {
|
||||||
minX = Math.min(minX, item.x)
|
minX = Math.min(minX, item.x)
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -14,7 +14,7 @@ export function minXFromBlocks(blocks:TextItemBlock[]) {
|
|||||||
return minX;
|
return minX;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function minXFromTextItems(items:TextItem) {
|
export function minXFromPageItems(items:PageItem) {
|
||||||
var minX = 999;
|
var minX = 999;
|
||||||
items.forEach(item => {
|
items.forEach(item => {
|
||||||
minX = Math.min(minX, item.x)
|
minX = Math.min(minX, item.x)
|
||||||
@ -25,13 +25,13 @@ export function minXFromTextItems(items:TextItem) {
|
|||||||
return minX;
|
return minX;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function sortByX(items:TextItem) {
|
export function sortByX(items:PageItem) {
|
||||||
items.sort((a, b) => {
|
items.sort((a, b) => {
|
||||||
return a.x - b.x;
|
return a.x - b.x;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export function sortCopyByX(items:TextItem) {
|
export function sortCopyByX(items:PageItem) {
|
||||||
const copy = items.concat();
|
const copy = items.concat();
|
||||||
sortByX(copy);
|
sortByX(copy);
|
||||||
return copy;
|
return copy;
|
@ -1,31 +1,30 @@
|
|||||||
import { expect } from 'chai';
|
import { expect } from 'chai';
|
||||||
|
|
||||||
import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
|
import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
|
||||||
import TextItem from '../src/javascript/models/TextItem.jsx';
|
import LineItem from '../src/javascript/models/LineItem.jsx';
|
||||||
|
|
||||||
describe('HeadlineFinder', () => {
|
describe('HeadlineFinder', () => {
|
||||||
|
|
||||||
|
|
||||||
it('Not Found - Case 1', () => {
|
it('Not Found - Case 1', () => {
|
||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: 'My Little Headline'
|
headline: 'My Little Headline'
|
||||||
});
|
});
|
||||||
const item1 = new TextItem({
|
const item1 = new LineItem({
|
||||||
text: 'My '
|
text: 'My '
|
||||||
});
|
});
|
||||||
const item2 = new TextItem({
|
const item2 = new LineItem({
|
||||||
text: 'Little'
|
text: 'Little'
|
||||||
});
|
});
|
||||||
const item3 = new TextItem({
|
const item3 = new LineItem({
|
||||||
text: ' Headline2'
|
text: ' Headline2'
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||||
expect(headlineFinder.consume(item3)).to.equal(null);
|
expect(headlineFinder.consume(item3)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -33,22 +32,22 @@ describe('HeadlineFinder', () => {
|
|||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: 'My Little Headline'
|
headline: 'My Little Headline'
|
||||||
});
|
});
|
||||||
const item1 = new TextItem({
|
const item1 = new LineItem({
|
||||||
text: 'My '
|
text: 'My '
|
||||||
});
|
});
|
||||||
const item2 = new TextItem({
|
const item2 = new LineItem({
|
||||||
text: 'Little'
|
text: 'Little'
|
||||||
});
|
});
|
||||||
const item3 = new TextItem({
|
const item3 = new LineItem({
|
||||||
text: ' Headline'
|
text: ' Headline'
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -56,27 +55,27 @@ describe('HeadlineFinder', () => {
|
|||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: 'My Little Headline'
|
headline: 'My Little Headline'
|
||||||
});
|
});
|
||||||
const item0 = new TextItem({
|
const item0 = new LineItem({
|
||||||
text: 'Waste '
|
text: 'Waste '
|
||||||
});
|
});
|
||||||
const item1 = new TextItem({
|
const item1 = new LineItem({
|
||||||
text: 'My '
|
text: 'My '
|
||||||
});
|
});
|
||||||
const item2 = new TextItem({
|
const item2 = new LineItem({
|
||||||
text: 'Little'
|
text: 'Little'
|
||||||
});
|
});
|
||||||
const item3 = new TextItem({
|
const item3 = new LineItem({
|
||||||
text: ' Headline'
|
text: ' Headline'
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(headlineFinder.consume(item0)).to.equal(null);
|
expect(headlineFinder.consume(item0)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(0);
|
||||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -84,27 +83,27 @@ describe('HeadlineFinder', () => {
|
|||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: 'My Little Headline'
|
headline: 'My Little Headline'
|
||||||
});
|
});
|
||||||
const item0 = new TextItem({
|
const item0 = new LineItem({
|
||||||
text: 'My '
|
text: 'My '
|
||||||
});
|
});
|
||||||
const item1 = new TextItem({
|
const item1 = new LineItem({
|
||||||
text: 'My '
|
text: 'My '
|
||||||
});
|
});
|
||||||
const item2 = new TextItem({
|
const item2 = new LineItem({
|
||||||
text: 'Little'
|
text: 'Little'
|
||||||
});
|
});
|
||||||
const item3 = new TextItem({
|
const item3 = new LineItem({
|
||||||
text: ' Headline'
|
text: ' Headline'
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(headlineFinder.consume(item0)).to.equal(null);
|
expect(headlineFinder.consume(item0)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item0);
|
||||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -112,22 +111,22 @@ describe('HeadlineFinder', () => {
|
|||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: 'MYLitt le HEADline'
|
headline: 'MYLitt le HEADline'
|
||||||
});
|
});
|
||||||
const item1 = new TextItem({
|
const item1 = new LineItem({
|
||||||
text: 'My '
|
text: 'My '
|
||||||
});
|
});
|
||||||
const item2 = new TextItem({
|
const item2 = new LineItem({
|
||||||
text: 'Little'
|
text: 'Little'
|
||||||
});
|
});
|
||||||
const item3 = new TextItem({
|
const item3 = new LineItem({
|
||||||
text: ' Headline'
|
text: ' Headline'
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(headlineFinder.consume(item1)).to.equal(null);
|
expect(headlineFinder.consume(item1)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(1).to.contain(item1);
|
||||||
expect(headlineFinder.consume(item2)).to.equal(null);
|
expect(headlineFinder.consume(item2)).to.equal(null);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
|
||||||
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
expect(headlineFinder.stackedLineItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -2,9 +2,10 @@ import { expect } from 'chai';
|
|||||||
|
|
||||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
||||||
|
|
||||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||||
|
|
||||||
it('single word', () => {
|
it('single word', () => {
|
||||||
|
|
||||||
expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false);
|
expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false);
|
||||||
expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false);
|
expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false);
|
||||||
|
|
||||||
@ -38,7 +39,7 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('removeLeadingWhitespaces', () => {
|
describe('functions: removeLeadingWhitespaces', () => {
|
||||||
it('No Removes', () => {
|
it('No Removes', () => {
|
||||||
expect(removeLeadingWhitespaces(".")).to.be.equal(".");
|
expect(removeLeadingWhitespaces(".")).to.be.equal(".");
|
||||||
expect(removeLeadingWhitespaces(". ")).to.be.equal(". ");
|
expect(removeLeadingWhitespaces(". ")).to.be.equal(". ");
|
||||||
@ -54,7 +55,7 @@ describe('removeLeadingWhitespaces', () => {
|
|||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('removeTrailingWhitespaces', () => {
|
describe('functions: removeTrailingWhitespaces', () => {
|
||||||
it('No Removes', () => {
|
it('No Removes', () => {
|
||||||
expect(removeTrailingWhitespaces(".")).to.be.equal(".");
|
expect(removeTrailingWhitespaces(".")).to.be.equal(".");
|
||||||
expect(removeTrailingWhitespaces(" .")).to.be.equal(" .");
|
expect(removeTrailingWhitespaces(" .")).to.be.equal(" .");
|
||||||
@ -71,7 +72,7 @@ describe('removeTrailingWhitespaces', () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
describe('prefixAfterWhitespace', () => {
|
describe('functions: prefixAfterWhitespace', () => {
|
||||||
it('Basic', () => {
|
it('Basic', () => {
|
||||||
expect(prefixAfterWhitespace('1', '2')).to.be.equal('12');
|
expect(prefixAfterWhitespace('1', '2')).to.be.equal('12');
|
||||||
expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12');
|
expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12');
|
||||||
@ -81,7 +82,7 @@ describe('prefixAfterWhitespace', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('suffixBeforeWhitespace', () => {
|
describe('functions: suffixBeforeWhitespace', () => {
|
||||||
it('Basic', () => {
|
it('Basic', () => {
|
||||||
expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
|
expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
|
||||||
expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.');
|
expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.');
|
||||||
@ -92,7 +93,7 @@ describe('suffixBeforeWhitespace', () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
describe('charCodeArray', () => {
|
describe('functions: charCodeArray', () => {
|
||||||
it('Charcodes', () => {
|
it('Charcodes', () => {
|
||||||
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
|
expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
|
||||||
});
|
});
|
||||||
@ -105,7 +106,7 @@ describe('charCodeArray', () => {
|
|||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('normalizedCharCodeArray', () => {
|
describe('functions: normalizedCharCodeArray', () => {
|
||||||
|
|
||||||
it('No Change', () => {
|
it('No Change', () => {
|
||||||
expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
|
expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
|
||||||
@ -131,7 +132,7 @@ describe('normalizedCharCodeArray', () => {
|
|||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('isListItem', () => {
|
describe('functions: isListItem', () => {
|
||||||
|
|
||||||
it('Match', () => {
|
it('Match', () => {
|
||||||
expect(isListItem('- my text')).to.equal(true);
|
expect(isListItem('- my text')).to.equal(true);
|
||||||
@ -154,7 +155,7 @@ describe('isListItem', () => {
|
|||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('isNumberedListItem', () => {
|
describe('functions: isNumberedListItem', () => {
|
||||||
|
|
||||||
it('Match', () => {
|
it('Match', () => {
|
||||||
expect(isNumberedListItem('1. my text')).to.equal(true);
|
expect(isNumberedListItem('1. my text')).to.equal(true);
|
||||||
@ -173,7 +174,7 @@ describe('isNumberedListItem', () => {
|
|||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('wordsMatch', () => {
|
describe('functions: wordsMatch', () => {
|
||||||
|
|
||||||
it('Match', () => {
|
it('Match', () => {
|
||||||
expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
|
expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
|
||||||
|
64
test/models/StashingStream.spec.js
Normal file
64
test/models/StashingStream.spec.js
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import { expect } from 'chai';
|
||||||
|
|
||||||
|
import StashingStream from '../../src/javascript/models/StashingStream';
|
||||||
|
import TextItem from '../../src/javascript/models/TextItem.jsx';
|
||||||
|
|
||||||
|
describe('StashingStream', () => {
|
||||||
|
|
||||||
|
it('Simple', () => {
|
||||||
|
const stream = new MyStashingStream();
|
||||||
|
|
||||||
|
stream.consume('a');
|
||||||
|
stream.consume('b');
|
||||||
|
stream.consume('a');
|
||||||
|
stream.consume('a');
|
||||||
|
stream.consume('z');
|
||||||
|
stream.consume('m');
|
||||||
|
stream.consume('m');
|
||||||
|
stream.consume('z');
|
||||||
|
stream.consume('z');
|
||||||
|
stream.consume('c');
|
||||||
|
stream.consume('e');
|
||||||
|
stream.consume('f');
|
||||||
|
stream.consume('m');
|
||||||
|
stream.consume('a');
|
||||||
|
|
||||||
|
const resultsAsString = stream.complete().join('');
|
||||||
|
|
||||||
|
expect(resultsAsString).to.equal('AbAAZZZcefA');
|
||||||
|
expect(stream.transformedItems).to.equal(10);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('ConsumeAll', () => {
|
||||||
|
const items = ['k', 'k', 'x', 'a', 'm', 'z', 'o', 'p']
|
||||||
|
const stream = new MyStashingStream();
|
||||||
|
stream.consumeAll(items);
|
||||||
|
|
||||||
|
const resultsAsString = stream.complete().join('');
|
||||||
|
expect(resultsAsString).to.equal('kkxAZop');
|
||||||
|
expect(stream.transformedItems).to.equal(3);
|
||||||
|
});
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
class MyStashingStream extends StashingStream {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super();
|
||||||
|
this.transformedItems = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
shouldStash(item) {
|
||||||
|
return item === 'a' || item === 'z' || item === 'm';
|
||||||
|
}
|
||||||
|
|
||||||
|
doMatchesStash(lastItem, item) {
|
||||||
|
return lastItem === item;
|
||||||
|
}
|
||||||
|
|
||||||
|
doFlushStash(stash, results) {
|
||||||
|
this.transformedItems += stash.length;
|
||||||
|
results.push(...stash.filter(elem => elem !== 'm').map(item => item.toUpperCase()));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user