mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-16 09:50:53 +01:00
WIP initial quote/code detector with new TextItemCombiner
This commit is contained in:
parent
d78d9be8a3
commit
f93d1e4aa1
@ -12,25 +12,36 @@ export default class PdfBlockPageView extends React.Component {
|
||||
|
||||
render() {
|
||||
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
|
||||
var numberOfNonEmptyBlocks = 0;
|
||||
const blockTables = pdfPage.blocks.map((block, i) => {
|
||||
|
||||
var blocks = pdfPage.blocks;
|
||||
if (modificationsOnly) {
|
||||
blocks = blocks.filter(block => block.annotation);
|
||||
}
|
||||
|
||||
const blockTables = blocks.map((block, i) => {
|
||||
var textItems = block.textItems;
|
||||
if (modificationsOnly) {
|
||||
textItems = textItems.filter(item => item.annotation);
|
||||
}
|
||||
if (textItems.length == 0 && modificationsOnly) {
|
||||
return <div key={ i } />
|
||||
} else {
|
||||
numberOfNonEmptyBlocks++;
|
||||
return <div key={ i }>
|
||||
<h4>Block { i + i }</h4>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
</div>
|
||||
}
|
||||
const blockType = block.type ? ' - ' + block.type : null;
|
||||
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
|
||||
: null;
|
||||
const borderStyle = block.annotation ? {
|
||||
marginBottom: "20px",
|
||||
border: "solid thin " + block.annotation.color
|
||||
} : null;
|
||||
const colorStyle = block.annotation ? {
|
||||
color: block.annotation.color
|
||||
} : null;
|
||||
return <div key={ i }>
|
||||
<div style={ colorStyle }>
|
||||
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
|
||||
</div>
|
||||
<div style={ borderStyle }>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
</div>
|
||||
</div>
|
||||
});
|
||||
|
||||
var content;
|
||||
if (numberOfNonEmptyBlocks == 0 && modificationsOnly) {
|
||||
if (blocks.length == 0 && modificationsOnly) {
|
||||
content = <div/>
|
||||
} else {
|
||||
const header = "Page " + (pdfPage.index + 1);
|
||||
|
@ -81,7 +81,7 @@ export default class TextItemTable extends React.Component {
|
||||
)
|
||||
|
||||
return (
|
||||
<Table responsive>
|
||||
<Table responsive bordered>
|
||||
{ tableHeader }
|
||||
<tbody>
|
||||
{ textItemRows }
|
||||
|
@ -29,7 +29,7 @@ export default class AppState {
|
||||
new RemoveRepetitiveElements(),
|
||||
new VerticalToHorizontal(),
|
||||
new DetectPdfBlocks(),
|
||||
// new DetectCodeBlocks(),
|
||||
new DetectCodeBlocks(),
|
||||
// new DetectFormats(),
|
||||
// new CombineSameY(),
|
||||
// new RemoveWhitespaces(),
|
||||
|
@ -3,6 +3,8 @@ export default class PdfBlock {
|
||||
|
||||
constructor(options) {
|
||||
this.textItems = options.textItems;
|
||||
this.type = options.type;
|
||||
this.annotation = options.annotation;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
//A text iteme, i.e. a line, within a page
|
||||
//A text item, i.e. a line or a word within a page
|
||||
export default class TextItem {
|
||||
|
||||
constructor(options) {
|
||||
|
56
src/javascript/models/TextItemCombiner.jsx
Normal file
56
src/javascript/models/TextItemCombiner.jsx
Normal file
@ -0,0 +1,56 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from './Annotation.jsx';
|
||||
|
||||
//Combines text items which are on the same Y at the same time doing inline transformations like 'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
export default class TextItemCombiner {
|
||||
|
||||
constructor(options) {
|
||||
this.transformEmphasis = options.transformEmphasis || true;
|
||||
console.debug(this.transformEmphasis);
|
||||
}
|
||||
|
||||
// returns a TextItem array new items
|
||||
combine(textItems: TextItem[]) {
|
||||
const resultItems = [];
|
||||
const groupedItems = groupByFollowingY(textItems);
|
||||
groupedItems.forEach(itemGroup => {
|
||||
if (itemGroup.length == 1) {
|
||||
resultItems.push(itemGroup[0]);
|
||||
} else {
|
||||
var text = '';
|
||||
itemGroup.forEach(item => {
|
||||
// item.annotation = REMOVED_ANNOTATION;
|
||||
// resultItems.push(item);
|
||||
text += item.text;
|
||||
});
|
||||
//TODO set other elements
|
||||
resultItems.push(new TextItem({
|
||||
text: text,
|
||||
}));
|
||||
}
|
||||
});
|
||||
|
||||
//TODO whitespace removal
|
||||
//TODO bold/emphasis
|
||||
|
||||
return resultItems;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Splits the given items into groups of directly consecutive items sharing the same `y`.
// The input order is preserved; items with the same y which are separated by an item with
// a different y end up in different groups. Returns an array of item arrays, which is
// empty for empty input.
function groupByFollowingY(textItems) {
    const yArrays = [];
    var itemsWithSameY = [];
    var lastItem;
    textItems.forEach(item => {
        if (itemsWithSameY.length > 0 && item.y != lastItem.y) {
            // The y changed: close the current group and start a new one.
            yArrays.push(itemsWithSameY);
            itemsWithSameY = [];
        }
        itemsWithSameY.push(item);
        lastItem = item;
    });
    // Only flush the trailing group if it holds something, otherwise empty input
    // would produce a single empty group.
    if (itemsWithSameY.length > 0) {
        yArrays.push(itemsWithSameY);
    }
    return yArrays;
}
|
82
src/javascript/models/transformations/DetectCodeBlocks.jsx
Normal file
82
src/javascript/models/transformations/DetectCodeBlocks.jsx
Normal file
@ -0,0 +1,82 @@
|
||||
import React from 'react';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
||||
export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Code Blocks");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Detected
|
||||
{ ' ' + parseResult.summary.foundBlocks + ' ' } blocks.
|
||||
</div>;
|
||||
}
|
||||
|
||||
// TODO ==> combine quotes follow each other
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedHeight} = parseResult.globals;
|
||||
|
||||
var foundBlocks = 0;
|
||||
const textCombiner = new TextItemCombiner({});
|
||||
|
||||
parseResult.content.forEach(page => {
|
||||
var minX = 999;
|
||||
page.blocks.forEach(block => {
|
||||
block.textItems.forEach(item => {
|
||||
minX = Math.min(minX, item.x)
|
||||
});
|
||||
});
|
||||
|
||||
if (minX < 999) {
|
||||
const itemAreSuitable = (items) => {
|
||||
for ( let item of items ) {
|
||||
if (item.x == minX) {
|
||||
return false;
|
||||
}
|
||||
if (item.height > mostUsedHeight + 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
const newBlocks = [];
|
||||
page.blocks.forEach(block => {
|
||||
if (block.type) {
|
||||
newBlocks.push(block);
|
||||
} else {
|
||||
if (itemAreSuitable(block.textItems)) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
newBlocks.push(block);
|
||||
newBlocks.push(new PdfBlock({
|
||||
type: 'Code/Quote',
|
||||
annotation: ADDED_ANNOTATION,
|
||||
textItems: textCombiner.combine(block.textItems)
|
||||
}));
|
||||
} else {
|
||||
newBlocks.push(block);
|
||||
}
|
||||
}
|
||||
});
|
||||
page.blocks = newBlocks;
|
||||
}
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
summary: {
|
||||
foundBlocks: foundBlocks
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,8 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView
|
||||
export default class ToPdfBlockViewTransformation extends Transformation {
|
||||
@ -29,4 +31,13 @@ export default class ToPdfBlockViewTransformation extends Transformation {
|
||||
showWhitespaces={ this.showWhitespaces } />;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
// The usual cleanup
|
||||
parseResult.content.forEach(page => {
|
||||
page.blocks = page.blocks.filter(block => !block.annotation || block.annotation !== REMOVED_ANNOTATION);
|
||||
page.blocks.forEach(block => block.annotation = null);
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
@ -25,8 +25,9 @@ export default class ToTextBlocks extends Transformation {
|
||||
// }
|
||||
text += '\n' + item.text;
|
||||
});
|
||||
const category = block.type ? block.type : 'Unknown';
|
||||
blocks.push({
|
||||
category: 'Unknown',
|
||||
category: category,
|
||||
text: text
|
||||
});
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user