WIP initial quote/code detector with new TextItemCombiner

This commit is contained in:
Johannes Zillmann 2017-02-18 10:50:54 +01:00
parent d78d9be8a3
commit f93d1e4aa1
9 changed files with 182 additions and 19 deletions

View File

@ -12,25 +12,36 @@ export default class PdfBlockPageView extends React.Component {
render() {
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
var numberOfNonEmptyBlocks = 0;
const blockTables = pdfPage.blocks.map((block, i) => {
var blocks = pdfPage.blocks;
if (modificationsOnly) {
blocks = blocks.filter(block => block.annotation);
}
const blockTables = blocks.map((block, i) => {
var textItems = block.textItems;
if (modificationsOnly) {
textItems = textItems.filter(item => item.annotation);
}
if (textItems.length == 0 && modificationsOnly) {
return <div key={ i } />
} else {
numberOfNonEmptyBlocks++;
return <div key={ i }>
<h4>Block { i + i }</h4>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
</div>
}
const blockType = block.type ? ' - ' + block.type : null;
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
: null;
const borderStyle = block.annotation ? {
marginBottom: "20px",
border: "solid thin " + block.annotation.color
} : null;
const colorStyle = block.annotation ? {
color: block.annotation.color
} : null;
return <div key={ i }>
<div style={ colorStyle }>
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
</div>
<div style={ borderStyle }>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
</div>
</div>
});
var content;
if (numberOfNonEmptyBlocks == 0 && modificationsOnly) {
if (blocks.length == 0 && modificationsOnly) {
content = <div/>
} else {
const header = "Page " + (pdfPage.index + 1);

View File

@ -81,7 +81,7 @@ export default class TextItemTable extends React.Component {
)
return (
<Table responsive>
<Table responsive bordered>
{ tableHeader }
<tbody>
{ textItemRows }

View File

@ -29,7 +29,7 @@ export default class AppState {
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
new DetectPdfBlocks(),
// new DetectCodeBlocks(),
new DetectCodeBlocks(),
// new DetectFormats(),
// new CombineSameY(),
// new RemoveWhitespaces(),

View File

@ -3,6 +3,8 @@ export default class PdfBlock {
constructor(options) {
this.textItems = options.textItems;
this.type = options.type;
this.annotation = options.annotation;
}
}

View File

@ -1,4 +1,4 @@
//A text iteme, i.e. a line, within a page
//A text item, i.e. a line or a word within a page
export default class TextItem {
constructor(options) {

View File

@ -0,0 +1,56 @@
import TextItem from './TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from './Annotation.jsx';
//Combines text items which are on the same Y, while at the same time doing inline transformations like whitespace removal, bold/emphasis annotation, link detection, etc.
export default class TextItemCombiner {
constructor(options) {
this.transformEmphasis = options.transformEmphasis || true;
console.debug(this.transformEmphasis);
}
// returns a TextItem array new items
combine(textItems: TextItem[]) {
const resultItems = [];
const groupedItems = groupByFollowingY(textItems);
groupedItems.forEach(itemGroup => {
if (itemGroup.length == 1) {
resultItems.push(itemGroup[0]);
} else {
var text = '';
itemGroup.forEach(item => {
// item.annotation = REMOVED_ANNOTATION;
// resultItems.push(item);
text += item.text;
});
//TODO set other elements
resultItems.push(new TextItem({
text: text,
}));
}
});
//TODO whitespace removal
//TODO bold/emphasis
return resultItems;
}
}
// Splits the given items into groups of *consecutive* items which have the same `y`.
// The order of the items is preserved; items with an equal `y` which do not follow
// each other directly end up in different groups.
// Returns an array of groups (arrays of items). An empty input yields an empty
// array - there is never an empty group in the result.
function groupByFollowingY(textItems) {
    const yArrays = [];
    var itemsWithSameY = [];
    var lastItem;
    textItems.forEach(item => {
        if (itemsWithSameY.length == 0 || item.y == lastItem.y) {
            itemsWithSameY.push(item);
        } else {
            // The Y changed: close the running group and start a new one.
            yArrays.push(itemsWithSameY);
            itemsWithSameY = [item];
        }
        lastItem = item;
    });
    // Flush the last running group, but only if there was any item at all.
    if (itemsWithSameY.length > 0) {
        yArrays.push(itemsWithSameY);
    }
    return yArrays;
}

View File

@ -0,0 +1,82 @@
import React from 'react';
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
//Detects quotes, code, etc., which are transformed to markdown code syntax
export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
constructor() {
super("Detect Code Blocks");
}
createSummaryView(parseResult:ParseResult) {
return <div>
Detected
{ ' ' + parseResult.summary.foundBlocks + ' ' } blocks.
</div>;
}
// TODO ==> combine quotes follow each other
transform(parseResult:ParseResult) {
const {mostUsedHeight} = parseResult.globals;
var foundBlocks = 0;
const textCombiner = new TextItemCombiner({});
parseResult.content.forEach(page => {
var minX = 999;
page.blocks.forEach(block => {
block.textItems.forEach(item => {
minX = Math.min(minX, item.x)
});
});
if (minX < 999) {
const itemAreSuitable = (items) => {
for ( let item of items ) {
if (item.x == minX) {
return false;
}
if (item.height > mostUsedHeight + 1) {
return false;
}
}
return true;
};
const newBlocks = [];
page.blocks.forEach(block => {
if (block.type) {
newBlocks.push(block);
} else {
if (itemAreSuitable(block.textItems)) {
block.annotation = REMOVED_ANNOTATION;
newBlocks.push(block);
newBlocks.push(new PdfBlock({
type: 'Code/Quote',
annotation: ADDED_ANNOTATION,
textItems: textCombiner.combine(block.textItems)
}));
} else {
newBlocks.push(block);
}
}
});
page.blocks = newBlocks;
}
});
return new ParseResult({
...parseResult,
summary: {
foundBlocks: foundBlocks
}
});
}
}

View File

@ -1,6 +1,8 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView
export default class ToPdfBlockViewTransformation extends Transformation {
@ -29,4 +31,13 @@ export default class ToPdfBlockViewTransformation extends Transformation {
showWhitespaces={ this.showWhitespaces } />;
}
completeTransform(parseResult:ParseResult) {
// The usual cleanup
parseResult.content.forEach(page => {
page.blocks = page.blocks.filter(block => !block.annotation || block.annotation !== REMOVED_ANNOTATION);
page.blocks.forEach(block => block.annotation = null);
});
return parseResult;
}
}

View File

@ -25,8 +25,9 @@ export default class ToTextBlocks extends Transformation {
// }
text += '\n' + item.text;
});
const category = block.type ? block.type : 'Unknown';
blocks.push({
category: 'Unknown',
category: category,
text: text
});
});