WIP add detect Lists function

This commit is contained in:
Johannes Zillmann 2017-02-19 14:23:35 +01:00
parent edfa76b033
commit a3b6a26437
8 changed files with 213 additions and 13 deletions

View File

@ -5,6 +5,7 @@ import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectFormats from './transformations/DetectFormats.jsx'
import CombineSameY from './transformations/CombineSameY.jsx';
@ -31,6 +32,7 @@ export default class AppState {
new VerticalToHorizontal(),
new DetectPdfBlocks(),
new DetectTOC(),
new DetectLists(),
new DetectCodeBlocks(),
// new DetectFormats(),
// new CombineSameY(),

View File

@ -1,6 +1,8 @@
import PdfBlock from './BlockPage.jsx';
export const HEADLINE1 = "Headline 1";
export const PARAGRAPH = "Paragraph";
export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC";

View File

@ -20,11 +20,19 @@ export default class TextItemCombiner {
var text = '';
var maxHeight = 0;
var widthSum = 0;
var lastItem;
itemGroup.forEach(item => {
// item.annotation = REMOVED_ANNOTATION;
// resultItems.push(item);
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
const xDistance = item.x - lastItem.x - lastItem.width;
if (xDistance >= 5) {
text += ' ';
}
}
text += item.text;
widthSum += item.width;
lastItem = item;
});
//TODO set other elements
resultItems.push(new TextItem({

View File

@ -5,18 +5,19 @@ import PdfBlock from '../PdfBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { CODE_BLOCK } from '../MarkdownElements.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
constructor() {
super("Detect Code Blocks");
super("Detect Code/Quotes");
}
createSummaryView(parseResult:ParseResult) {
return <div>
Detected
{ ' ' + parseResult.summary.foundBlocks + ' ' } blocks.
{ ' ' + parseResult.summary.foundBlocks + ' ' } code/quote blocks.
</div>;
}
@ -27,14 +28,8 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
const textCombiner = new TextItemCombiner({});
parseResult.content.forEach(page => {
var minX = 999;
page.blocks.forEach(block => {
block.textItems.forEach(item => {
minX = Math.min(minX, item.x)
});
});
if (minX < 999) {
var minX = minXFromBlocks(page.blocks);
if (minX) {
const itemAreSuitable = (items) => {
for ( let item of items ) {
if (item.x == minX) {

View File

@ -0,0 +1,148 @@
import React from 'react';
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
export default class DetectLists extends ToPdfBlockViewTransformation {
constructor() {
super("Detect Lists");
}
createSummaryView(parseResult:ParseResult) {
return <div>
Detected
{ ' ' + parseResult.summary.foundBlocks + ' ' } list blocks.
</div>;
}
transform(parseResult:ParseResult) {
var foundBlocks = 0;
const textCombiner = new TextItemCombiner({});
parseResult.content.forEach(page => {
var minX = minXFromBlocks(page.blocks);
if (minX) {
const newBlocks = [];
page.blocks.forEach(block => {
newBlocks.push(block);
if (!block.type) {
const yGroupedItems = textCombiner.combine(block.textItems);
if (hasMoreThan2LineItems(yGroupedItems)) {
block.annotation = REMOVED_ANNOTATION;
foundBlocks++;
var lastItemX;
var currentLevel = 0;
var itemsBeforeFirstLineItem = [];
var listBlockItems = [];
var xByLevel = {};
const pushLineItem = (originalItem, text, setLevel) => {
if (lastItemX && setLevel) {
if (originalItem.x > lastItemX) {
currentLevel++;
xByLevel[originalItem.x] = currentLevel;
} else if (originalItem.x < lastItemX) {
currentLevel = xByLevel[originalItem.x];
}
} else {
xByLevel[originalItem.x] = 0;
}
listBlockItems.push(new TextItem({
...originalItem,
text: ' '.repeat(currentLevel * 3) + text
}));
lastItemX = originalItem.x;
};
yGroupedItems.forEach(lineItem => {
if (isPlainListItem(lineItem.text)) {
var text = lineItem.text;
text = text.substring(1, text.length).trim();
text = '- ' + text;
pushLineItem(lineItem, text, true);
} else if (isNumberedListItem(lineItem.text)) {
var numberedText = lineItem.text;
numberedText
pushLineItem(lineItem, numberedText, true);
} else {
if (lastItemX) {
pushLineItem(lineItem, lineItem.text, false);
} else {
itemsBeforeFirstLineItem.push(lineItem);
}
}
});
if (itemsBeforeFirstLineItem.length > 0) {
newBlocks.push(new PdfBlock({
textItems: itemsBeforeFirstLineItem,
type: PARAGRAPH,
annotation: ADDED_ANNOTATION
}));
}
//TODO display with whitespace pre support
newBlocks.push(new PdfBlock({
textItems: listBlockItems,
type: LIST_BLOCK,
annotation: ADDED_ANNOTATION
}));
}
}
});
page.blocks = newBlocks;
}
});
return new ParseResult({
...parseResult,
summary: {
foundBlocks: foundBlocks
}
});
}
}
function hasMoreThan2LineItems(textItems:TextItem[]) {
var numberOfListItemLineStarts = 0;
for ( let item of textItems ) {
if (isPlainListItem(item.text) || isNumberedListItem(item.text)) {
numberOfListItemLineStarts++;
if (numberOfListItemLineStarts == 2) {
return true;
}
}
}
return false;
}
// Tells whether a line opens a bulleted list item, i.e. its very first
// character is a dash or a bullet point.
function isPlainListItem(string) {
    const bulletMarkers = ['-', '•'];
    return bulletMarkers.some(marker => string.startsWith(marker));
}
// Tells whether a line opens a numbered list item: one or more digits at the
// very start of the line, directly followed by an enumerator delimiter
// ('.' or ')').
// A leading digit alone is not enough - that would also match ordinary text
// such as "2017 was ..." - and a digit right after the delimiter is rejected
// so that decimals like "3.5 percent" are not mistaken for list markers.
function isNumberedListItem(string) {
    return /^\d+[.)](?!\d)/.test(string);
}

View File

@ -3,6 +3,7 @@ import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlockPage from '../PdfBlockPage.jsx';
import PdfBlock from '../PdfBlock.jsx';
import { minXFromTextItems } from '../../textItemFunctions.jsx';
export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
@ -21,6 +22,7 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0;
const newContent = parseResult.content.map(page => {
var minX = minXFromTextItems(page.textItems);
const blocks = [];
var textItemsInBlock = [];
const completBlock = () => {
@ -31,9 +33,9 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
};
var lastItem;
page.textItems.forEach(item => {
if (lastItem) {
const distance = lastItem.y - item.y;
if (distance < 0 - mostUsedDistance / 2 || distance > mostUsedDistance) {
if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
completBlock();
}
}
@ -59,3 +61,20 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
}
}
// Decides whether `item` should start a new block instead of being appended to
// the block that `lastItem` belongs to.
//  - lastItem / item: consecutive text items (their x and y are read)
//  - minX: smallest x on the page, used to recognize indented items
//  - mostUsedDistance: the most common vertical distance between items
// The vertical gap is lastItem.y - item.y, so it is positive when `item` has the
// smaller y. A clearly negative gap, or a gap above the allowed maximum, splits.
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
    const verticalGap = lastItem.y - item.y;

    // gap is negative - and not only a bit
    const jumpedBack = verticalGap < 0 - mostUsedDistance / 2;
    if (jumpedBack) {
        return true;
    }

    // indented elements like lists often have greater spacing, so consecutive
    // items sharing the same indented x get a more generous limit
    const sameIndentedColumn = lastItem.x == item.x && item.x > minX;
    const allowedGap = sameIndentedColumn
        ? mostUsedDistance + mostUsedDistance / 2
        : mostUsedDistance + 1;

    return verticalGap > allowedGap;
}

View File

@ -18,7 +18,7 @@ export default class ToMarkdown extends Transformation {
var text = '';
parseResult.content.forEach(page => {
page.blocks.forEach((block) => {
text += block.text + '\n\n';
text += block.text + '\n';
});
});
return new ParseResult({

View File

@ -0,0 +1,26 @@
import PdfBlock from './models/PdfBlock.jsx';
import TextItem from './models/TextItem.jsx';
export function minXFromBlocks(blocks:PdfBlock[]) {
var minX = 999;
blocks.forEach(block => {
block.textItems.forEach(item => {
minX = Math.min(minX, item.x)
});
});
if (minX == 999) {
return null;
}
return minX;
}
export function minXFromTextItems(items:TextItem) {
var minX = 999;
items.forEach(item => {
minX = Math.min(minX, item.x)
});
if (minX == 999) {
return null;
}
return minX;
}