mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-22 07:43:46 +01:00
WIP Introduce PdfBlockView
* Add vertical to horizontal transformation * Improve header/footer removal
This commit is contained in:
parent
a92e384249
commit
767462bc9b
46
src/javascript/components/debug/PdfBlockPageView.jsx
Normal file
46
src/javascript/components/debug/PdfBlockPageView.jsx
Normal file
@ -0,0 +1,46 @@
|
||||
import React from 'react';
|
||||
import TextItemTable from './TextItemTable.jsx';
|
||||
|
||||
// View for a PdfBlockPage
|
||||
export default class PdfBlockPageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
pdfPage: React.PropTypes.object.isRequired,
|
||||
modificationsOnly: React.PropTypes.bool.isRequired,
|
||||
showWhitespaces: React.PropTypes.bool
|
||||
};
|
||||
|
||||
render() {
|
||||
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
|
||||
var numberOfNonEmptyBlocks = 0;
|
||||
const blockTables = pdfPage.blocks.map((block, i) => {
|
||||
var textItems = block.textItems;
|
||||
if (modificationsOnly) {
|
||||
textItems = textItems.filter(item => item.annotation);
|
||||
}
|
||||
if (textItems.length == 0 && modificationsOnly) {
|
||||
return <div key={ i } />
|
||||
} else {
|
||||
numberOfNonEmptyBlocks++;
|
||||
return <div key={ i }>
|
||||
<h4>Block { i + i }</h4>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
</div>
|
||||
}
|
||||
});
|
||||
|
||||
var content;
|
||||
if (numberOfNonEmptyBlocks == 0 && modificationsOnly) {
|
||||
content = <div/>
|
||||
} else {
|
||||
const header = "Page " + (pdfPage.index + 1);
|
||||
content = <div>
|
||||
<h2>{ header }</h2>
|
||||
{ blockTables }
|
||||
</div>
|
||||
}
|
||||
return (
|
||||
content
|
||||
);
|
||||
}
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
import React from 'react';
|
||||
import TextItemTable from './TextItemTable.jsx';
|
||||
|
||||
import Table from 'react-bootstrap/lib/Table'
|
||||
|
||||
// View for a PdfPage
|
||||
export default class PdfPageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
@ -11,89 +11,23 @@ export default class PdfPageView extends React.Component {
|
||||
};
|
||||
|
||||
render() {
|
||||
const header = "Page " + (this.props.pdfPage.index + 1);
|
||||
const {modificationsOnly, showWhitespaces} = this.props;
|
||||
var textItems = this.props.pdfPage.textItems;
|
||||
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
|
||||
const header = "Page " + (pdfPage.index + 1);
|
||||
var textItems = pdfPage.textItems;
|
||||
if (modificationsOnly) {
|
||||
textItems = textItems.filter(item => item.annotation);
|
||||
}
|
||||
|
||||
var content = <div/>
|
||||
if (!modificationsOnly || textItems.length > 0) {
|
||||
var content;
|
||||
if (textItems.length == 0 && modificationsOnly) {
|
||||
content = <div/>
|
||||
} else {
|
||||
content = <div>
|
||||
<h2>{ header }</h2>
|
||||
<Table responsive>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>
|
||||
#
|
||||
</th>
|
||||
<th>
|
||||
Text
|
||||
</th>
|
||||
<th>
|
||||
X
|
||||
</th>
|
||||
<th>
|
||||
Y
|
||||
</th>
|
||||
<th>
|
||||
Width
|
||||
</th>
|
||||
<th>
|
||||
Height
|
||||
</th>
|
||||
<th>
|
||||
Font
|
||||
<br/>(asc/desc)
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{ textItems.map((textItem, i) => <tr key={ i } style={ textItem.annotation ? {
|
||||
color: textItem.annotation.color
|
||||
} : null }>
|
||||
<td>
|
||||
<div style={ { textAlign: 'center' } }>
|
||||
{ i }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center' } }>
|
||||
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
{ showWhitespaces ? (
|
||||
<pre style={ textItem.annotation ? {
|
||||
color: textItem.annotation.color,
|
||||
display: 'inline-block',
|
||||
} : {
|
||||
display: 'inline-block'
|
||||
} }>{ textItem.text }</pre>
|
||||
) : (textItem.text) }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.x }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.y }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.width }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.height }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.font }
|
||||
<br/>
|
||||
{ textItem.fontAscent + ' / ' + textItem.fontDescent }
|
||||
</td>
|
||||
</tr>
|
||||
) }
|
||||
</tbody>
|
||||
</Table>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
content
|
||||
);
|
||||
|
92
src/javascript/components/debug/TextItemTable.jsx
Normal file
92
src/javascript/components/debug/TextItemTable.jsx
Normal file
@ -0,0 +1,92 @@
|
||||
import React from 'react';
|
||||
|
||||
import Table from 'react-bootstrap/lib/Table'
|
||||
|
||||
// Displays an array of TextItem as a table
|
||||
export default class TextItemTable extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
textItems: React.PropTypes.array.isRequired,
|
||||
showWhitespaces: React.PropTypes.bool
|
||||
};
|
||||
|
||||
render() {
|
||||
const {showWhitespaces, textItems} = this.props;
|
||||
const tableHeader = <thead>
|
||||
<tr>
|
||||
<th>
|
||||
#
|
||||
</th>
|
||||
<th>
|
||||
Text
|
||||
</th>
|
||||
<th>
|
||||
X
|
||||
</th>
|
||||
<th>
|
||||
Y
|
||||
</th>
|
||||
<th>
|
||||
Width
|
||||
</th>
|
||||
<th>
|
||||
Height
|
||||
</th>
|
||||
<th>
|
||||
Font
|
||||
<br/>(asc/desc)
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
|
||||
const textItemRows = textItems.map((textItem, i) => <tr key={ i } style={ textItem.annotation ? {
|
||||
color: textItem.annotation.color
|
||||
} : null }>
|
||||
<td>
|
||||
<div style={ { textAlign: 'center' } }>
|
||||
{ i }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center' } }>
|
||||
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
{ showWhitespaces ? (
|
||||
<pre style={ textItem.annotation ? {
|
||||
color: textItem.annotation.color,
|
||||
display: 'inline-block',
|
||||
} : {
|
||||
display: 'inline-block'
|
||||
} }>{ textItem.text }</pre>
|
||||
) : (textItem.text) }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.x }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.y }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.width }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.height }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.font }
|
||||
<br/>
|
||||
{ textItem.fontAscent + ' / ' + textItem.fontDescent }
|
||||
</td>
|
||||
</tr>
|
||||
)
|
||||
|
||||
return (
|
||||
<Table responsive>
|
||||
{ tableHeader }
|
||||
<tbody>
|
||||
{ textItemRows }
|
||||
</tbody>
|
||||
</Table>
|
||||
);
|
||||
}
|
||||
}
|
@ -1,15 +1,19 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||
import SplitInBlocks from './transformations/SplitInBlocks.jsx'
|
||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||
import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import DetectLinks from './transformations/DetectLinks.jsx'
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||
|
||||
// Holds the state of the Application
|
||||
@ -22,15 +26,19 @@ export default class AppState {
|
||||
this.pdfPages = [];
|
||||
this.transformations = [
|
||||
new CalculateGlobalStats(),
|
||||
new DetectFormats(),
|
||||
new CombineSameY(),
|
||||
new RemoveWhitespaces(),
|
||||
new DetectFootnotes(),
|
||||
new DetectLinks(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new HeadlineDetector(),
|
||||
new HeadlineToUppercase(),
|
||||
new ToBlockSystem(),
|
||||
new VerticalToHorizontal(),
|
||||
new SplitInBlocks(),
|
||||
// new DetectCodeBlocks(),
|
||||
// new DetectFormats(),
|
||||
// new CombineSameY(),
|
||||
// new RemoveWhitespaces(),
|
||||
// new DetectFootnotes(),
|
||||
// new DetectLinks(),
|
||||
// new HeadlineDetector(),
|
||||
// new HeadlineToUppercase(),
|
||||
// new ToBlockSystem(),
|
||||
new ToTextBlocks(),
|
||||
new ToMarkdown()];
|
||||
|
||||
//bind functions
|
||||
|
8
src/javascript/models/PdfBlock.jsx
Normal file
8
src/javascript/models/PdfBlock.jsx
Normal file
@ -0,0 +1,8 @@
|
||||
// A block within a PdfPage
|
||||
export default class PdfBlock {
|
||||
|
||||
constructor(options) {
|
||||
this.textItems = options.textItems;
|
||||
}
|
||||
|
||||
}
|
9
src/javascript/models/PdfBlockPage.jsx
Normal file
9
src/javascript/models/PdfBlockPage.jsx
Normal file
@ -0,0 +1,9 @@
|
||||
// A page which holds TextItems grouped by block, displayable via PdfBlockPageView
|
||||
export default class PdfBlockPage {
|
||||
|
||||
constructor(options) {
|
||||
this.index = options.index;
|
||||
this.blocks = options.blocks;
|
||||
}
|
||||
|
||||
}
|
@ -81,19 +81,33 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
const mostUsedDistance = parseInt(getMostUsedKey(distanceToOccurrence));
|
||||
|
||||
|
||||
parseResult.globals = {
|
||||
mostUsedHeight: mostUsedHeight,
|
||||
mostUsedFont: mostUsedFont,
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
maxHeight: maxHeight,
|
||||
maxHeightFont: maxHeightFont
|
||||
}
|
||||
parseResult.summary = {
|
||||
heightToOccurrence: heightToOccurrence,
|
||||
fontToOccurrence: fontToOccurrence,
|
||||
distanceToOccurrence: distanceToOccurrence,
|
||||
}
|
||||
return parseResult;
|
||||
//Make a copy of the originals so all following transformation don't modify them
|
||||
const newContent = parseResult.content.map(pdfPage => {
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: pdfPage.textItems.map(textItem => {
|
||||
return {
|
||||
...textItem,
|
||||
}
|
||||
})
|
||||
};
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
globals: {
|
||||
mostUsedHeight: mostUsedHeight,
|
||||
mostUsedFont: mostUsedFont,
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
maxHeight: maxHeight,
|
||||
maxHeightFont: maxHeightFont,
|
||||
},
|
||||
summary: {
|
||||
heightToOccurrence: heightToOccurrence,
|
||||
fontToOccurrence: fontToOccurrence,
|
||||
distanceToOccurrence: distanceToOccurrence,
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
import React from 'react';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
|
@ -1,3 +1,4 @@
|
||||
import React from 'react';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
@ -5,12 +6,12 @@ import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { isDigit } from '../../functions.jsx'
|
||||
|
||||
|
||||
function hashCodeIgnoringNumbers(string) {
|
||||
var hash = 0, i, charCode, len;
|
||||
if (string.length === 0) return hash;
|
||||
for (i = 0, len = string.length; i < len; i++) {
|
||||
charCode = string.charCodeAt(i);
|
||||
if (!isDigit(charCode)) {
|
||||
function hashCodeIgnoringSpacesAndNumbers(string) {
|
||||
var hash = 0;
|
||||
if (string.trim().length === 0) return hash;
|
||||
for (var i = 0; i < string.length; i++) {
|
||||
const charCode = string.charCodeAt(i);
|
||||
if (!isDigit(charCode) && charCode != 32 && charCode != 160) {
|
||||
hash = ((hash << 5) - hash) + charCode;
|
||||
hash |= 0; // Convert to 32bit integer
|
||||
}
|
||||
@ -18,10 +19,6 @@ function hashCodeIgnoringNumbers(string) {
|
||||
return hash;
|
||||
}
|
||||
|
||||
function combineCoordinates(textItem) {
|
||||
var hashCode = hashCodeIgnoringNumbers(textItem.text);
|
||||
return `${textItem.x}-${textItem.y}-${hashCode}`;
|
||||
}
|
||||
|
||||
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
||||
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
@ -30,27 +27,88 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
super("Remove Repetitive Elements");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
<ul>
|
||||
<li>
|
||||
{ 'Removed Header: ' + parseResult.summary.removedHeader + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Removed Footers: ' + parseResult.summary.removedFooter + ' ' }
|
||||
</li>
|
||||
</ul>
|
||||
</div>;
|
||||
}
|
||||
|
||||
// The idea is the following:
|
||||
// - For each page, collect all items of the first, and all items of the last line
|
||||
// - Calculate how often these items occur across all pages (hash ignoring numbers, whitespace, upper/lowercase)
|
||||
// - Delete items occurring on more than 2/3 of all pages
|
||||
transform(parseResult:ParseResult) {
|
||||
//build repetition counts for every element
|
||||
const repetitionCounts = {};
|
||||
|
||||
// find first and last lines per page
|
||||
const pageStore = [];
|
||||
const minLineHashRepetitions = {};
|
||||
const maxLineHashRepetitions = {};
|
||||
parseResult.content.forEach(pdfPage => {
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
var combinedCoordinates = combineCoordinates(textItem);
|
||||
repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1;
|
||||
const minMaxItems = pdfPage.textItems.reduce((itemStore, item) => {
|
||||
if (item.y < itemStore.minY) {
|
||||
itemStore.minElements = [item];
|
||||
itemStore.minY = item.y;
|
||||
} else if (item.y == itemStore.minY) {
|
||||
itemStore.minElements.push(item);
|
||||
}
|
||||
if (item.y > itemStore.maxY) {
|
||||
itemStore.maxElements = [item];
|
||||
itemStore.maxY = item.y;
|
||||
} else if (item.y == itemStore.maxY) {
|
||||
itemStore.maxElements.push(item);
|
||||
}
|
||||
return itemStore;
|
||||
}, {
|
||||
minY: 999,
|
||||
maxY: 0,
|
||||
minElements: [],
|
||||
maxElements: []
|
||||
});
|
||||
|
||||
const minLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.minElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
|
||||
const maxLineHash = hashCodeIgnoringSpacesAndNumbers(minMaxItems.maxElements.reduce((combinedString, item) => combinedString + item.text.trim().toUpperCase(), ''));
|
||||
pageStore.push({
|
||||
minElements: minMaxItems.minElements,
|
||||
maxElements: minMaxItems.maxElements,
|
||||
minLineHash: minLineHash,
|
||||
maxLineHash: maxLineHash
|
||||
});
|
||||
minLineHashRepetitions[minLineHash] = minLineHashRepetitions[minLineHash] ? minLineHashRepetitions[minLineHash] + 1 : 1;
|
||||
maxLineHashRepetitions[maxLineHash] = maxLineHashRepetitions[maxLineHash] ? maxLineHashRepetitions[maxLineHash] + 1 : 1;
|
||||
});
|
||||
|
||||
// annotate elements with repetition as removed
|
||||
parseResult.content.forEach(pdfPage => {
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
var combinedCoordinates = combineCoordinates(textItem);
|
||||
if (repetitionCounts[combinedCoordinates] > 1) {
|
||||
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
|
||||
textItem.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
});
|
||||
// now annotate all removed items
|
||||
var removedHeader = 0;
|
||||
var removedFooter = 0;
|
||||
parseResult.content.forEach((pdfPage, i) => {
|
||||
if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
|
||||
pageStore[i].minElements.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
});
|
||||
removedFooter++;
|
||||
}
|
||||
if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
|
||||
pageStore[i].maxElements.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
});
|
||||
removedHeader++;
|
||||
}
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
summary: {
|
||||
removedHeader: removedHeader,
|
||||
removedFooter: removedFooter,
|
||||
}
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
|
65
src/javascript/models/transformations/SplitInBlocks.jsx
Normal file
65
src/javascript/models/transformations/SplitInBlocks.jsx
Normal file
@ -0,0 +1,65 @@
|
||||
import React from 'react';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlockPage from '../PdfBlockPage.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
|
||||
export default class SplitInBlocks extends ToPdfBlockViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Split Into Blocks");
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return false;
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Splitted into
|
||||
{ ' ' + parseResult.summary.createdBlocks + ' ' } blocks.
|
||||
</div>;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var createdBlocks = 0;
|
||||
const newContent = parseResult.content.map(page => {
|
||||
const blocks = [];
|
||||
var textItemsInBlock = [];
|
||||
const completBlock = () => {
|
||||
blocks.push(new PdfBlock({
|
||||
textItems: textItemsInBlock
|
||||
}));
|
||||
textItemsInBlock = [];
|
||||
};
|
||||
var lastItem;
|
||||
page.textItems.forEach(item => {
|
||||
if (lastItem) {
|
||||
const distance = lastItem.y - item.y;
|
||||
if (distance < 0 - mostUsedDistance / 2 || distance > mostUsedDistance) {
|
||||
completBlock();
|
||||
}
|
||||
}
|
||||
textItemsInBlock.push(item);
|
||||
lastItem = item;
|
||||
});
|
||||
completBlock();
|
||||
|
||||
createdBlocks += blocks.length;
|
||||
return new PdfBlockPage({
|
||||
...page,
|
||||
blocks: blocks
|
||||
});
|
||||
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
summary: {
|
||||
createdBlocks: createdBlocks
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx';
|
||||
|
||||
// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView
|
||||
export default class ToPdfBlockViewTransformation extends Transformation {
|
||||
|
||||
constructor(name) {
|
||||
super(name);
|
||||
if (this.constructor === ToPdfBlockViewTransformation) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
this.showWhitespaces = false;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) {
|
||||
return <PdfBlockPageView
|
||||
key={ page.index }
|
||||
pdfPage={ page }
|
||||
modificationsOnly={ modificationsOnly }
|
||||
showWhitespaces={ this.showWhitespaces } />;
|
||||
}
|
||||
|
||||
}
|
@ -2,7 +2,7 @@ import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfPageView from '../../components/debug/PdfPageView.jsx';
|
||||
|
||||
// Abstract pdfView transformation
|
||||
// Abstract class for transformations producing a PdfPage to be shown in the PdfView
|
||||
export default class ToPdfViewTransformation extends Transformation {
|
||||
|
||||
constructor(name) {
|
||||
|
44
src/javascript/models/transformations/ToTextBlocks.jsx
Normal file
44
src/javascript/models/transformations/ToTextBlocks.jsx
Normal file
@ -0,0 +1,44 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import BlockPageView from '../../components/debug/BlockPageView.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import BlockPage from '../BlockPage.jsx';
|
||||
|
||||
export default class ToTextBlocks extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("To Text Blocks");
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||
return <BlockPageView key={ page.index } page={ page } />;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const blocks = [];
|
||||
parseResult.content.forEach(page => {
|
||||
page.blocks.forEach(block => {
|
||||
var text = '';
|
||||
block.textItems.forEach(item => {
|
||||
// if (item.markdownElement) {
|
||||
// text = item.markdownElement.transformText(item.text);
|
||||
// }
|
||||
text += '\n' + item.text;
|
||||
});
|
||||
blocks.push({
|
||||
category: 'Unknown',
|
||||
text: text
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: [new BlockPage({
|
||||
index: 0,
|
||||
blocks: blocks
|
||||
})],
|
||||
});
|
||||
}
|
||||
|
||||
}
|
101
src/javascript/models/transformations/VerticalToHorizontal.jsx
Normal file
101
src/javascript/models/transformations/VerticalToHorizontal.jsx
Normal file
@ -0,0 +1,101 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Converts vertical text to horizontal
|
||||
export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Vertical to Horizontal Text");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const newContent = parseResult.content.map(page => {
|
||||
const newTextItems = [];
|
||||
// var oneCharacterItems = [];
|
||||
|
||||
// const applyTransformation = () => {
|
||||
// oneCharacterItems.forEach(item => {
|
||||
// item.annotation = REMOVED_ANNOTATION;
|
||||
// newTextItems.push(item);
|
||||
// //TODO add new
|
||||
// });
|
||||
// oneCharacterItems = [];
|
||||
// };
|
||||
// const rollbackTransformation = () => {
|
||||
// oneCharacterItems.forEach(item => {
|
||||
// newTextItems.push(item);
|
||||
// });
|
||||
// oneCharacterItems = [];
|
||||
// };
|
||||
|
||||
//TODO generic state machine code ?
|
||||
|
||||
page.textItems.reduce((oneCharacterItems, item) => {
|
||||
if (item.text.trim().length == 1) {
|
||||
if (oneCharacterItems.length == 0) {
|
||||
oneCharacterItems.push(item);
|
||||
} else {
|
||||
const lastItem = oneCharacterItems[oneCharacterItems.length - 1];
|
||||
if (lastItem.y - item.y > 5 && lastItem.font === item.font) {
|
||||
oneCharacterItems.push(item);
|
||||
} else {
|
||||
if (oneCharacterItems.length > 5) {
|
||||
var combinedText = '';
|
||||
var minX = 999;
|
||||
var maxY = 0;
|
||||
var sumWidth = 0;
|
||||
var maxHeight = 0;
|
||||
oneCharacterItems.forEach(oneCharacterItem => {
|
||||
oneCharacterItem.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(oneCharacterItem);
|
||||
combinedText += oneCharacterItem.text.trim();
|
||||
minX = Math.min(minX, oneCharacterItem.x);
|
||||
maxY = Math.max(maxY, oneCharacterItem.y);
|
||||
sumWidth += oneCharacterItem.width;
|
||||
maxHeight = Math.max(maxHeight, oneCharacterItem.height);
|
||||
});
|
||||
newTextItems.push(new TextItem({
|
||||
...oneCharacterItems[0],
|
||||
x: minX,
|
||||
y: maxY,
|
||||
width: sumWidth,
|
||||
height: maxHeight,
|
||||
text: combinedText,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
} else {
|
||||
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||
}
|
||||
oneCharacterItems = [item];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||
oneCharacterItems = [];
|
||||
newTextItems.push(item);
|
||||
}
|
||||
return oneCharacterItems;
|
||||
}, []);
|
||||
|
||||
return {
|
||||
...page,
|
||||
textItems: newTextItems
|
||||
};
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
});
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user