WIP Introduce PdfBlockView

* Add vertical to horizontal transformation
* Improve header/footer removal
This commit is contained in:
Johannes Zillmann 2017-02-17 20:17:04 +01:00
parent a92e384249
commit 767462bc9b
14 changed files with 536 additions and 126 deletions

View File

@ -0,0 +1,46 @@
import React from 'react';
import TextItemTable from './TextItemTable.jsx';
// View for a PdfBlockPage
export default class PdfBlockPageView extends React.Component {
static propTypes = {
pdfPage: React.PropTypes.object.isRequired,
modificationsOnly: React.PropTypes.bool.isRequired,
showWhitespaces: React.PropTypes.bool
};
render() {
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
var numberOfNonEmptyBlocks = 0;
const blockTables = pdfPage.blocks.map((block, i) => {
var textItems = block.textItems;
if (modificationsOnly) {
textItems = textItems.filter(item => item.annotation);
}
if (textItems.length == 0 && modificationsOnly) {
return <div key={ i } />
} else {
numberOfNonEmptyBlocks++;
return <div key={ i }>
<h4>Block { i + i }</h4>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
</div>
}
});
var content;
if (numberOfNonEmptyBlocks == 0 && modificationsOnly) {
content = <div/>
} else {
const header = "Page " + (pdfPage.index + 1);
content = <div>
<h2>{ header }</h2>
{ blockTables }
</div>
}
return (
content
);
}
}

View File

@ -1,7 +1,7 @@
import React from 'react';
import TextItemTable from './TextItemTable.jsx';
import Table from 'react-bootstrap/lib/Table'
// View for a PdfPage
export default class PdfPageView extends React.Component {
static propTypes = {
@ -11,89 +11,23 @@ export default class PdfPageView extends React.Component {
};
render() {
const header = "Page " + (this.props.pdfPage.index + 1);
const {modificationsOnly, showWhitespaces} = this.props;
var textItems = this.props.pdfPage.textItems;
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
const header = "Page " + (pdfPage.index + 1);
var textItems = pdfPage.textItems;
if (modificationsOnly) {
textItems = textItems.filter(item => item.annotation);
}
var content = <div/>
if (!modificationsOnly || textItems.length > 0) {
var content;
if (textItems.length == 0 && modificationsOnly) {
content = <div/>
} else {
content = <div>
<h2>{ header }</h2>
<Table responsive>
<thead>
<tr>
<th>
#
</th>
<th>
Text
</th>
<th>
X
</th>
<th>
Y
</th>
<th>
Width
</th>
<th>
Height
</th>
<th>
Font
<br/>(asc/desc)
</th>
</tr>
</thead>
<tbody>
{ textItems.map((textItem, i) => <tr key={ i } style={ textItem.annotation ? {
color: textItem.annotation.color
} : null }>
<td>
<div style={ { textAlign: 'center' } }>
{ i }
</div>
<div style={ { textAlign: 'center' } }>
{ textItem.annotation ? textItem.annotation.category : '' }
</div>
</td>
<td>
{ showWhitespaces ? (
<pre style={ textItem.annotation ? {
color: textItem.annotation.color,
display: 'inline-block',
} : {
display: 'inline-block'
} }>{ textItem.text }</pre>
) : (textItem.text) }
</td>
<td>
{ textItem.x }
</td>
<td>
{ textItem.y }
</td>
<td>
{ textItem.width }
</td>
<td>
{ textItem.height }
</td>
<td>
{ textItem.font }
<br/>
{ textItem.fontAscent + ' / ' + textItem.fontDescent }
</td>
</tr>
) }
</tbody>
</Table>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
</div>
}
return (
content
);

View File

@ -0,0 +1,92 @@
import React from 'react';
import Table from 'react-bootstrap/lib/Table'
// Displays an array of TextItem as a table
export default class TextItemTable extends React.Component {
static propTypes = {
textItems: React.PropTypes.array.isRequired,
showWhitespaces: React.PropTypes.bool
};
render() {
const {showWhitespaces, textItems} = this.props;
const tableHeader = <thead>
<tr>
<th>
#
</th>
<th>
Text
</th>
<th>
X
</th>
<th>
Y
</th>
<th>
Width
</th>
<th>
Height
</th>
<th>
Font
<br/>(asc/desc)
</th>
</tr>
</thead>
const textItemRows = textItems.map((textItem, i) => <tr key={ i } style={ textItem.annotation ? {
color: textItem.annotation.color
} : null }>
<td>
<div style={ { textAlign: 'center' } }>
{ i }
</div>
<div style={ { textAlign: 'center' } }>
{ textItem.annotation ? textItem.annotation.category : '' }
</div>
</td>
<td>
{ showWhitespaces ? (
<pre style={ textItem.annotation ? {
color: textItem.annotation.color,
display: 'inline-block',
} : {
display: 'inline-block'
} }>{ textItem.text }</pre>
) : (textItem.text) }
</td>
<td>
{ textItem.x }
</td>
<td>
{ textItem.y }
</td>
<td>
{ textItem.width }
</td>
<td>
{ textItem.height }
</td>
<td>
{ textItem.font }
<br/>
{ textItem.fontAscent + ' / ' + textItem.fontDescent }
</td>
</tr>
)
return (
<Table responsive>
{ tableHeader }
<tbody>
{ textItemRows }
</tbody>
</Table>
);
}
}

View File

@ -1,15 +1,19 @@
import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import SplitInBlocks from './transformations/SplitInBlocks.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectFormats from './transformations/DetectFormats.jsx'
import CombineSameY from './transformations/CombineSameY.jsx';
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectLinks from './transformations/DetectLinks.jsx'
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
// Holds the state of the Application
@ -22,15 +26,19 @@ export default class AppState {
this.pdfPages = [];
this.transformations = [
new CalculateGlobalStats(),
new DetectFormats(),
new CombineSameY(),
new RemoveWhitespaces(),
new DetectFootnotes(),
new DetectLinks(),
new RemoveRepetitiveElements(),
new HeadlineDetector(),
new HeadlineToUppercase(),
new ToBlockSystem(),
new VerticalToHorizontal(),
new SplitInBlocks(),
// new DetectCodeBlocks(),
// new DetectFormats(),
// new CombineSameY(),
// new RemoveWhitespaces(),
// new DetectFootnotes(),
// new DetectLinks(),
// new HeadlineDetector(),
// new HeadlineToUppercase(),
// new ToBlockSystem(),
new ToTextBlocks(),
new ToMarkdown()];
//bind functions

View File

@ -0,0 +1,8 @@
// A block within a PdfPage; holds the TextItems that were grouped into it
export default class PdfBlock {

    // options.textItems: the TextItems belonging to this block
    constructor(options) {
        const { textItems } = options;
        this.textItems = textItems;
    }

}

View File

@ -0,0 +1,9 @@
// A page whose TextItems are grouped into blocks; displayable via PdfBlockPageView
export default class PdfBlockPage {

    // options.index:  the page's index
    // options.blocks: the blocks of the page
    constructor(options) {
        const { index, blocks } = options;
        this.index = index;
        this.blocks = blocks;
    }

}

View File

@ -81,19 +81,33 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
const mostUsedDistance = parseInt(getMostUsedKey(distanceToOccurrence));
parseResult.globals = {
mostUsedHeight: mostUsedHeight,
mostUsedFont: mostUsedFont,
mostUsedDistance: mostUsedDistance,
maxHeight: maxHeight,
maxHeightFont: maxHeightFont
}
parseResult.summary = {
heightToOccurrence: heightToOccurrence,
fontToOccurrence: fontToOccurrence,
distanceToOccurrence: distanceToOccurrence,
}
return parseResult;
//Make a copy of the originals so all following transformation don't modify them
const newContent = parseResult.content.map(pdfPage => {
return {
...pdfPage,
textItems: pdfPage.textItems.map(textItem => {
return {
...textItem,
}
})
};
});
return new ParseResult({
...parseResult,
content: newContent,
globals: {
mostUsedHeight: mostUsedHeight,
mostUsedFont: mostUsedFont,
mostUsedDistance: mostUsedDistance,
maxHeight: maxHeight,
maxHeightFont: maxHeightFont,
},
summary: {
heightToOccurrence: heightToOccurrence,
fontToOccurrence: fontToOccurrence,
distanceToOccurrence: distanceToOccurrence,
}
});
}

View File

@ -1,6 +1,5 @@
import React from 'react';
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
import Annotation from '../Annotation.jsx';

View File

@ -1,3 +1,4 @@
import React from 'react';
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
@ -5,12 +6,12 @@ import { REMOVED_ANNOTATION } from '../Annotation.jsx';
import { isDigit } from '../../functions.jsx'
function hashCodeIgnoringNumbers(string) {
var hash = 0, i, charCode, len;
if (string.length === 0) return hash;
for (i = 0, len = string.length; i < len; i++) {
charCode = string.charCodeAt(i);
if (!isDigit(charCode)) {
function hashCodeIgnoringSpacesAndNumbers(string) {
var hash = 0;
if (string.trim().length === 0) return hash;
for (var i = 0; i < string.length; i++) {
const charCode = string.charCodeAt(i);
if (!isDigit(charCode) && charCode != 32 && charCode != 160) {
hash = ((hash << 5) - hash) + charCode;
hash |= 0; // Convert to 32bit integer
}
@ -18,10 +19,6 @@ function hashCodeIgnoringNumbers(string) {
return hash;
}
function combineCoordinates(textItem) {
var hashCode = hashCodeIgnoringNumbers(textItem.text);
return `${textItem.x}-${textItem.y}-${hashCode}`;
}
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
@ -30,27 +27,88 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
super("Remove Repetitive Elements");
}
createSummaryView(parseResult:ParseResult) {
return <div>
<ul>
<li>
{ 'Removed Header: ' + parseResult.summary.removedHeader + ' ' }
</li>
<li>
{ 'Removed Footers: ' + parseResult.summary.removedFooter + ' ' }
</li>
</ul>
</div>;
}
// The idea is the following:
// - For each page, collect all items of the first, and all items of the last line
// - Calculate how often these items occur across all pages (hash ignoring numbers, whitespace, upper/lowercase)
// - Delete items occurring on more than 2/3 of all pages
transform(parseResult:ParseResult) {
//build repetition counts for every element
const repetitionCounts = {};
// find first and last lines per page
const pageStore = [];
const minLineHashRepetitions = {};
const maxLineHashRepetitions = {};
parseResult.content.forEach(pdfPage => {
pdfPage.textItems.forEach(textItem => {
var combinedCoordinates = combineCoordinates(textItem);
repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1;
const minMaxItems = pdfPage.textItems.reduce((itemStore, item) => {
if (item.y < itemStore.minY) {
itemStore.minElements = [item];
itemStore.minY = item.y;
} else if (item.y == itemStore.minY) {
itemStore.minElements.push(item);
}
if (item.y > itemStore.maxY) {
itemStore.maxElements = [item];
itemStore.maxY = item.y;
} else if (item.y == itemStore.maxY) {
itemStore.maxElements.push(item);
}
return itemStore;
}, {
minY: 999,
maxY: 0,
minElements: [],
maxElements: []
});
const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
pageStore.push({
minElements: minMaxItems.minElements,
maxElements: minMaxItems.maxElements,
minLineHash: minLineHash,
maxLineHash: maxLineHash
});
minLineHashRepetitions[minLineHash] = minLineHashRepetitions[minLineHash] ? minLineHashRepetitions[minLineHash] + 1 : 1;
maxLineHashRepetitions[maxLineHash] = maxLineHashRepetitions[maxLineHash] ? maxLineHashRepetitions[maxLineHash] + 1 : 1;
});
// annotate elements with repetition as removed
parseResult.content.forEach(pdfPage => {
pdfPage.textItems.forEach(textItem => {
var combinedCoordinates = combineCoordinates(textItem);
if (repetitionCounts[combinedCoordinates] > 1) {
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
textItem.annotation = REMOVED_ANNOTATION;
}
});
// now annotate all removed items
var removedHeader = 0;
var removedFooter = 0;
parseResult.content.forEach((pdfPage, i) => {
if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
pageStore[i].minElements.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
});
removedFooter++;
}
if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
pageStore[i].maxElements.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
});
removedHeader++;
}
});
return new ParseResult({
...parseResult,
summary: {
removedHeader: removedHeader,
removedFooter: removedFooter,
}
});
return parseResult;
}
completeTransform(parseResult:ParseResult) {

View File

@ -0,0 +1,65 @@
import React from 'react';
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlockPage from '../PdfBlockPage.jsx';
import PdfBlock from '../PdfBlock.jsx';
export default class SplitInBlocks extends ToPdfBlockViewTransformation {
constructor() {
super("Split Into Blocks");
}
showModificationCheckbox() {
return false;
}
createSummaryView(parseResult:ParseResult) {
return <div>
Splitted into
{ ' ' + parseResult.summary.createdBlocks + ' ' } blocks.
</div>;
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0;
const newContent = parseResult.content.map(page => {
const blocks = [];
var textItemsInBlock = [];
const completBlock = () => {
blocks.push(new PdfBlock({
textItems: textItemsInBlock
}));
textItemsInBlock = [];
};
var lastItem;
page.textItems.forEach(item => {
if (lastItem) {
const distance = lastItem.y - item.y;
if (distance < 0 - mostUsedDistance / 2 || distance > mostUsedDistance) {
completBlock();
}
}
textItemsInBlock.push(item);
lastItem = item;
});
completBlock();
createdBlocks += blocks.length;
return new PdfBlockPage({
...page,
blocks: blocks
});
});
return new ParseResult({
...parseResult,
content: newContent,
summary: {
createdBlocks: createdBlocks
}
});
}
}

View File

@ -0,0 +1,32 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx';
// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView
export default class ToPdfBlockViewTransformation extends Transformation {
constructor(name) {
super(name);
if (this.constructor === ToPdfBlockViewTransformation) {
throw new TypeError("Can not construct abstract class.");
}
this.showWhitespaces = false;
}
showPageSelection() {
return true;
}
showModificationCheckbox() {
return true;
}
createPageView(page, modificationsOnly) {
return <PdfBlockPageView
key={ page.index }
pdfPage={ page }
modificationsOnly={ modificationsOnly }
showWhitespaces={ this.showWhitespaces } />;
}
}

View File

@ -2,7 +2,7 @@ import React from 'react';
import Transformation from './Transformation.jsx';
import PdfPageView from '../../components/debug/PdfPageView.jsx';
// Abstract pdfView transformation
// Abstract class for transformations producing a PdfPage to be shown in the PdfView
export default class ToPdfViewTransformation extends Transformation {
constructor(name) {

View File

@ -0,0 +1,44 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import BlockPageView from '../../components/debug/BlockPageView.jsx';
import ParseResult from '../ParseResult.jsx';
import BlockPage from '../BlockPage.jsx';
export default class ToTextBlocks extends Transformation {
constructor() {
super("To Text Blocks");
}
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
return <BlockPageView key={ page.index } page={ page } />;
}
transform(parseResult:ParseResult) {
const blocks = [];
parseResult.content.forEach(page => {
page.blocks.forEach(block => {
var text = '';
block.textItems.forEach(item => {
// if (item.markdownElement) {
// text = item.markdownElement.transformText(item.text);
// }
text += '\n' + item.text;
});
blocks.push({
category: 'Unknown',
text: text
});
});
});
return new ParseResult({
...parseResult,
content: [new BlockPage({
index: 0,
blocks: blocks
})],
});
}
}

View File

@ -0,0 +1,101 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
// Converts vertical text to horizontal
export default class VerticalToHorizontal extends ToPdfViewTransformation {
constructor() {
super("Vertical to Horizontal Text");
}
transform(parseResult:ParseResult) {
const newContent = parseResult.content.map(page => {
const newTextItems = [];
// var oneCharacterItems = [];
// const applyTransformation = () => {
// oneCharacterItems.forEach(item => {
// item.annotation = REMOVED_ANNOTATION;
// newTextItems.push(item);
// //TODO add new
// });
// oneCharacterItems = [];
// };
// const rollbackTransformation = () => {
// oneCharacterItems.forEach(item => {
// newTextItems.push(item);
// });
// oneCharacterItems = [];
// };
//TODO generic state machine code ?
page.textItems.reduce((oneCharacterItems, item) => {
if (item.text.trim().length == 1) {
if (oneCharacterItems.length == 0) {
oneCharacterItems.push(item);
} else {
const lastItem = oneCharacterItems[oneCharacterItems.length - 1];
if (lastItem.y - item.y > 5 && lastItem.font === item.font) {
oneCharacterItems.push(item);
} else {
if (oneCharacterItems.length > 5) {
var combinedText = '';
var minX = 999;
var maxY = 0;
var sumWidth = 0;
var maxHeight = 0;
oneCharacterItems.forEach(oneCharacterItem => {
oneCharacterItem.annotation = REMOVED_ANNOTATION;
newTextItems.push(oneCharacterItem);
combinedText += oneCharacterItem.text.trim();
minX = Math.min(minX, oneCharacterItem.x);
maxY = Math.max(maxY, oneCharacterItem.y);
sumWidth += oneCharacterItem.width;
maxHeight = Math.max(maxHeight, oneCharacterItem.height);
});
newTextItems.push(new TextItem({
...oneCharacterItems[0],
x: minX,
y: maxY,
width: sumWidth,
height: maxHeight,
text: combinedText,
annotation: ADDED_ANNOTATION
}));
} else {
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
}
oneCharacterItems = [item];
}
}
} else {
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
oneCharacterItems = [];
newTextItems.push(item);
}
return oneCharacterItems;
}, []);
return {
...page,
textItems: newTextItems
};
});
return new ParseResult({
...parseResult,
content: newContent,
});
}
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return parseResult;
}
}