mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-28 07:40:47 +01:00
WIP add detect Lists function
This commit is contained in:
parent
edfa76b033
commit
a3b6a26437
@ -5,6 +5,7 @@ import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements
|
||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||
import DetectLists from './transformations/DetectLists.jsx'
|
||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||
import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
@ -31,6 +32,7 @@ export default class AppState {
|
||||
new VerticalToHorizontal(),
|
||||
new DetectPdfBlocks(),
|
||||
new DetectTOC(),
|
||||
new DetectLists(),
|
||||
new DetectCodeBlocks(),
|
||||
// new DetectFormats(),
|
||||
// new CombineSameY(),
|
||||
|
@ -1,6 +1,8 @@
|
||||
import PdfBlock from './BlockPage.jsx';
|
||||
|
||||
export const HEADLINE1 = "Headline 1";
|
||||
export const PARAGRAPH = "Paragraph";
|
||||
export const LIST_BLOCK = "List";
|
||||
export const CODE_BLOCK = "Code/Quote";
|
||||
export const TOC_BLOCK = "TOC";
|
||||
|
||||
|
@ -20,11 +20,19 @@ export default class TextItemCombiner {
|
||||
var text = '';
|
||||
var maxHeight = 0;
|
||||
var widthSum = 0;
|
||||
var lastItem;
|
||||
itemGroup.forEach(item => {
|
||||
// item.annotation = REMOVED_ANNOTATION;
|
||||
// resultItems.push(item);
|
||||
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
||||
const xDistance = item.x - lastItem.x - lastItem.width;
|
||||
if (xDistance >= 5) {
|
||||
text += ' ';
|
||||
}
|
||||
}
|
||||
text += item.text;
|
||||
widthSum += item.width;
|
||||
lastItem = item;
|
||||
});
|
||||
//TODO set other elements
|
||||
resultItems.push(new TextItem({
|
||||
|
@ -5,18 +5,19 @@ import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { CODE_BLOCK } from '../MarkdownElements.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detects quotes, code etc., which are transformed to markdown code syntax
|
||||
export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Code Blocks");
|
||||
super("Detect Code/Quotes");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Detected
|
||||
{ ' ' + parseResult.summary.foundBlocks + ' ' } blocks.
|
||||
{ ' ' + parseResult.summary.foundBlocks + ' ' } code/quote blocks.
|
||||
</div>;
|
||||
}
|
||||
|
||||
@ -27,14 +28,8 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
const textCombiner = new TextItemCombiner({});
|
||||
|
||||
parseResult.content.forEach(page => {
|
||||
var minX = 999;
|
||||
page.blocks.forEach(block => {
|
||||
block.textItems.forEach(item => {
|
||||
minX = Math.min(minX, item.x)
|
||||
});
|
||||
});
|
||||
|
||||
if (minX < 999) {
|
||||
var minX = minXFromBlocks(page.blocks);
|
||||
if (minX) {
|
||||
const itemAreSuitable = (items) => {
|
||||
for ( let item of items ) {
|
||||
if (item.x == minX) {
|
||||
|
148
src/javascript/models/transformations/DetectLists.jsx
Normal file
148
src/javascript/models/transformations/DetectLists.jsx
Normal file
@ -0,0 +1,148 @@
|
||||
import React from 'react';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
// Detects bulleted and numbered lists inside blocks that have no type yet and
// re-emits them as markdown list blocks. Nested items are indented by three
// spaces per level.
export default class DetectLists extends ToPdfBlockViewTransformation {

    constructor() {
        super("Detect Lists");
    }

    // Renders how many list blocks the transformation found (parseResult.summary.foundBlocks).
    createSummaryView(parseResult:ParseResult) {
        return <div>
                 Detected
                 { ' ' + parseResult.summary.foundBlocks + ' ' } list blocks.
               </div>;
    }

    // For every untyped block containing at least two list item lines:
    //  - the original block is kept but annotated as removed,
    //  - lines preceding the first list item become a new PARAGRAPH block,
    //  - the list lines become a new LIST_BLOCK block.
    // page.blocks is replaced in place; the returned ParseResult carries the
    // number of converted blocks in summary.foundBlocks.
    transform(parseResult:ParseResult) {
        var foundBlocks = 0;
        const textCombiner = new TextItemCombiner({});

        parseResult.content.forEach(page => {
            // minX is only used to skip pages without any text item (minXFromBlocks
            // returns null then). Compare explicitly, since 0 is a valid coordinate.
            const minX = minXFromBlocks(page.blocks);
            if (minX === null || minX === undefined) {
                return;
            }

            const newBlocks = [];
            page.blocks.forEach(block => {
                newBlocks.push(block);
                if (block.type) {
                    // already classified by an earlier transformation
                    return;
                }
                // NOTE(review): combine() presumably yields one item per text line - confirm
                const yGroupedItems = textCombiner.combine(block.textItems);
                if (!hasMoreThan2LineItems(yGroupedItems)) {
                    return;
                }
                block.annotation = REMOVED_ANNOTATION;
                foundBlocks++;

                const {itemsBeforeFirstLineItem, listBlockItems} = collectListLines(yGroupedItems);
                if (itemsBeforeFirstLineItem.length > 0) {
                    newBlocks.push(new PdfBlock({
                        textItems: itemsBeforeFirstLineItem,
                        type: PARAGRAPH,
                        annotation: ADDED_ANNOTATION
                    }));
                }
                //TODO display with whitespace pre support
                newBlocks.push(new PdfBlock({
                    textItems: listBlockItems,
                    type: LIST_BLOCK,
                    annotation: ADDED_ANNOTATION
                }));
            });
            page.blocks = newBlocks;
        });

        return new ParseResult({
            ...parseResult,
            summary: {
                foundBlocks: foundBlocks
            }
        });
    }

}

// Splits the line items of one block into the lines preceding the first list
// item and the markdown formatted list lines.
// The nesting level is derived from the x coordinate of the list item starts
// only; continuation lines (wrapped text) inherit the level of the item they
// follow and never influence the level of later items.
function collectListLines(lineItems) {
    const itemsBeforeFirstLineItem = [];
    const listBlockItems = [];
    const levelByX = new Map();
    var currentLevel = 0;
    var lastListItemX; // x of the previous list item start, undefined until the list begins

    const enterListItem = (x) => {
        if (lastListItemX !== undefined) {
            if (x > lastListItemX) {
                currentLevel++;
            } else if (x < lastListItemX) {
                // dedent: reuse the level recorded for this x, or fall back to the
                // closest known position left of it instead of ending up undefined
                currentLevel = levelByX.has(x) ? levelByX.get(x) : closestKnownLevel(levelByX, x);
            }
        }
        levelByX.set(x, currentLevel);
        lastListItemX = x;
    };

    const pushLine = (originalItem, text) => {
        listBlockItems.push(new TextItem({
            ...originalItem,
            text: ' '.repeat(currentLevel * 3) + text
        }));
    };

    lineItems.forEach(lineItem => {
        if (isPlainListItem(lineItem.text)) {
            enterListItem(lineItem.x);
            // replace the original bullet character with the markdown one
            pushLine(lineItem, '- ' + lineItem.text.substring(1).trim());
        } else if (isNumberedListItem(lineItem.text)) {
            enterListItem(lineItem.x);
            pushLine(lineItem, lineItem.text);
        } else if (lastListItemX !== undefined) {
            // continuation of the current list item
            pushLine(lineItem, lineItem.text);
        } else {
            itemsBeforeFirstLineItem.push(lineItem);
        }
    });

    return {
        itemsBeforeFirstLineItem: itemsBeforeFirstLineItem,
        listBlockItems: listBlockItems
    };
}

// Returns the level recorded for the greatest known x that is <= the given x,
// or 0 if no such position has been recorded.
function closestKnownLevel(levelByX, x) {
    var closestX = -Infinity;
    var level = 0;
    levelByX.forEach((knownLevel, knownX) => {
        if (knownX <= x && knownX > closestX) {
            closestX = knownX;
            level = knownLevel;
        }
    });
    return level;
}
|
||||
|
||||
// Tells whether the given line items contain enough list item starts to be
// treated as a list. Note: despite the name, the scan stops and answers true
// as soon as the second bulleted/numbered line is found.
function hasMoreThan2LineItems(textItems:TextItem[]) {
    var stillNeeded = 2;
    for ( const candidate of textItems ) {
        const startsListItem = isPlainListItem(candidate.text) || isNumberedListItem(candidate.text);
        if (!startsListItem) {
            continue;
        }
        stillNeeded--;
        if (stillNeeded === 0) {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
// Tells whether a line starts with a bullet character ('-' or '•').
// Anything that is not a string (e.g. a text item without text) is not a list
// item; previously such input raised a TypeError.
function isPlainListItem(string) {
    if (typeof string !== 'string') {
        return false;
    }
    return string.startsWith('-') || string.startsWith('•');
}
|
||||
|
||||
// Tells whether a line starts like a numbered list item: one or more digits
// followed by '.' or ')' and then whitespace or the end of the line
// (e.g. "1. foo", "12) bar").
// A leading digit alone is not enough, otherwise ordinary sentences such as
// "2017 was a good year" or "1.5 million users" would be taken for list items.
// Anything that is not a string is not a list item.
function isNumberedListItem(string) {
    if (typeof string !== 'string') {
        return false;
    }
    return /^\d+[.)](?:\s|$)/.test(string);
}
|
||||
|
@ -3,6 +3,7 @@ import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlockPage from '../PdfBlockPage.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import { minXFromTextItems } from '../../textItemFunctions.jsx';
|
||||
|
||||
export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
|
||||
@ -21,6 +22,7 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var createdBlocks = 0;
|
||||
const newContent = parseResult.content.map(page => {
|
||||
var minX = minXFromTextItems(page.textItems);
|
||||
const blocks = [];
|
||||
var textItemsInBlock = [];
|
||||
const completBlock = () => {
|
||||
@ -31,9 +33,9 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
};
|
||||
var lastItem;
|
||||
page.textItems.forEach(item => {
|
||||
|
||||
if (lastItem) {
|
||||
const distance = lastItem.y - item.y;
|
||||
if (distance < 0 - mostUsedDistance / 2 || distance > mostUsedDistance) {
|
||||
if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
|
||||
completBlock();
|
||||
}
|
||||
}
|
||||
@ -59,3 +61,20 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Decides whether a new block has to start between two consecutive text items.
//  lastItem / item   - the previous and the current text item (x and y are read)
//  minX              - smallest x coordinate of the page
//  mostUsedDistance  - the most frequent line distance of the document
// Returns true if the vertical gap between both items is too large for them
// to belong to the same block.
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
    const distance = lastItem.y - item.y;
    if (distance < 0 - mostUsedDistance / 2) {
        //distance is negative - and not only a bit
        return true;
    }
    var allowedDistance = mostUsedDistance + 1;
    // strict comparison: coordinates must match as numbers, without type coercion
    if (lastItem.x === item.x && item.x > minX) {
        //indented elements like lists often have greater spacing
        allowedDistance = mostUsedDistance + mostUsedDistance / 2;
    }
    return distance > allowedDistance;
}
|
@ -18,7 +18,7 @@ export default class ToMarkdown extends Transformation {
|
||||
var text = '';
|
||||
parseResult.content.forEach(page => {
|
||||
page.blocks.forEach((block) => {
|
||||
text += block.text + '\n\n';
|
||||
text += block.text + '\n';
|
||||
});
|
||||
});
|
||||
return new ParseResult({
|
||||
|
26
src/javascript/textItemFunctions.jsx
Normal file
26
src/javascript/textItemFunctions.jsx
Normal file
@ -0,0 +1,26 @@
|
||||
import PdfBlock from './models/PdfBlock.jsx';
|
||||
import TextItem from './models/TextItem.jsx';
|
||||
|
||||
// Returns the smallest x coordinate of all text items of the given blocks,
// or null if the blocks contain no text item at all.
// Starts from Infinity instead of a magic 999, so that items located at
// x >= 999 are no longer mistaken for "no items".
export function minXFromBlocks(blocks:PdfBlock[]) {
    var minX = Infinity;
    blocks.forEach(block => {
        block.textItems.forEach(item => {
            minX = Math.min(minX, item.x);
        });
    });
    return minX === Infinity ? null : minX;
}
|
||||
|
||||
// Returns the smallest x coordinate of the given text items, or null if the
// list is empty.
// Starts from Infinity instead of a magic 999, so that items located at
// x >= 999 are no longer mistaken for "no items".
export function minXFromTextItems(items:TextItem[]) {
    var minX = Infinity;
    items.forEach(item => {
        minX = Math.min(minX, item.x);
    });
    return minX === Infinity ? null : minX;
}
|
Loading…
Reference in New Issue
Block a user