mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-28 18:53:40 +01:00
WIP initial TOC detection
This commit is contained in:
parent
bed3fd357b
commit
2783d724e5
@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
|||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||||
|
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||||
import DetectFormats from './transformations/DetectFormats.jsx'
|
import DetectFormats from './transformations/DetectFormats.jsx'
|
||||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||||
@ -29,6 +30,7 @@ export default class AppState {
|
|||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new VerticalToHorizontal(),
|
new VerticalToHorizontal(),
|
||||||
new DetectPdfBlocks(),
|
new DetectPdfBlocks(),
|
||||||
|
new DetectTOC(),
|
||||||
new DetectCodeBlocks(),
|
new DetectCodeBlocks(),
|
||||||
// new DetectFormats(),
|
// new DetectFormats(),
|
||||||
// new CombineSameY(),
|
// new CombineSameY(),
|
||||||
|
@ -1,17 +1,25 @@
|
|||||||
import PdfBlock from './BlockPage.jsx';
|
import PdfBlock from './BlockPage.jsx';
|
||||||
|
|
||||||
export const CODE_BLOCK = "Code/Quote";
|
|
||||||
export const HEADLINE1 = "Headline 1";
|
export const HEADLINE1 = "Headline 1";
|
||||||
|
export const CODE_BLOCK = "Code/Quote";
|
||||||
|
export const TOC_BLOCK = "TOC";
|
||||||
|
|
||||||
export function blockToText(block: PdfBlock) {
|
export function blockToText(block: PdfBlock) {
|
||||||
const text = concatTextItems(block);
|
|
||||||
switch (block.type) {
|
switch (block.type) {
|
||||||
case CODE_BLOCK:
|
case CODE_BLOCK:
|
||||||
return '```\n' + text + '```'
|
return '```\n' + concatTextItems(block) + '```'
|
||||||
case HEADLINE1:
|
case TOC_BLOCK:
|
||||||
return '#' + text;
|
//TODO 2nd level
|
||||||
default:
|
//TODO real links
|
||||||
|
var text = '';
|
||||||
|
block.textItems.forEach(item => {
|
||||||
|
text += '- ' + item.text + '\n';
|
||||||
|
});
|
||||||
return text;
|
return text;
|
||||||
|
case HEADLINE1:
|
||||||
|
return '#' + concatTextItems(block);
|
||||||
|
default:
|
||||||
|
return concatTextItems(block);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,26 +6,32 @@ export default class TextItemCombiner {
|
|||||||
|
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
this.transformEmphasis = options.transformEmphasis || true;
|
this.transformEmphasis = options.transformEmphasis || true;
|
||||||
console.debug(this.transformEmphasis);
|
this.maxYDerivation = options.transformEmphasis || 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
// returns a TextItem array new items
|
// returns a TextItem array new items
|
||||||
combine(textItems: TextItem[]) {
|
combine(textItems: TextItem[]) {
|
||||||
const resultItems = [];
|
const resultItems = [];
|
||||||
const groupedItems = groupByFollowingY(textItems);
|
const groupedItems = this.groupByFollowingY(textItems);
|
||||||
groupedItems.forEach(itemGroup => {
|
groupedItems.forEach(itemGroup => {
|
||||||
if (itemGroup.length == 1) {
|
if (itemGroup.length == 1) {
|
||||||
resultItems.push(itemGroup[0]);
|
resultItems.push(itemGroup[0]);
|
||||||
} else {
|
} else {
|
||||||
var text = '';
|
var text = '';
|
||||||
|
var maxHeight = 0;
|
||||||
|
var widthSum = 0;
|
||||||
itemGroup.forEach(item => {
|
itemGroup.forEach(item => {
|
||||||
// item.annotation = REMOVED_ANNOTATION;
|
// item.annotation = REMOVED_ANNOTATION;
|
||||||
// resultItems.push(item);
|
// resultItems.push(item);
|
||||||
text += item.text;
|
text += item.text;
|
||||||
|
widthSum += item.width;
|
||||||
});
|
});
|
||||||
//TODO set other elements
|
//TODO set other elements
|
||||||
resultItems.push(new TextItem({
|
resultItems.push(new TextItem({
|
||||||
|
...itemGroup[0],
|
||||||
text: text,
|
text: text,
|
||||||
|
height: maxHeight,
|
||||||
|
width: widthSum,
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -36,21 +42,20 @@ export default class TextItemCombiner {
|
|||||||
return resultItems;
|
return resultItems;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
groupByFollowingY(textItems) {
|
||||||
|
const yArrays = [];
|
||||||
function groupByFollowingY(textItems) {
|
var itemsWithSameY = [];
|
||||||
const yArrays = [];
|
var lastItem;
|
||||||
var itemsWithSameY = [];
|
textItems.forEach(item => {
|
||||||
var lastItem;
|
if (itemsWithSameY.length == 0 || Math.abs(lastItem.y - item.y) <= this.maxYDerivation) {
|
||||||
textItems.forEach(item => {
|
itemsWithSameY.push(item);
|
||||||
if (itemsWithSameY.length == 0 || item.y == lastItem.y) {
|
} else {
|
||||||
itemsWithSameY.push(item);
|
yArrays.push(itemsWithSameY);
|
||||||
} else {
|
itemsWithSameY = [item];
|
||||||
yArrays.push(itemsWithSameY);
|
}
|
||||||
itemsWithSameY = [item];
|
lastItem = item;
|
||||||
}
|
})
|
||||||
lastItem = item;
|
yArrays.push(itemsWithSameY);
|
||||||
})
|
return yArrays;
|
||||||
yArrays.push(itemsWithSameY);
|
}
|
||||||
return yArrays;
|
|
||||||
}
|
}
|
||||||
|
@ -20,8 +20,6 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
|||||||
</div>;
|
</div>;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO ==> combine quotes follow each other
|
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
|
const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
|
||||||
|
|
||||||
|
89
src/javascript/models/transformations/DetectTOC.jsx
Normal file
89
src/javascript/models/transformations/DetectTOC.jsx
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
import React from 'react';
|
||||||
|
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||||
|
import ParseResult from '../ParseResult.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import PdfBlock from '../PdfBlock.jsx';
|
||||||
|
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||||
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
import { TOC_BLOCK } from '../MarkdownElements.jsx';
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
import { groupByFollowingY } from '../TextItemCombiner.jsx';
|
||||||
|
import { isNumber, isDigit } from '../../functions.jsx'
|
||||||
|
|
||||||
|
//Detect table of contents pages
|
||||||
|
export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Detect Table of Contents");
|
||||||
|
}
|
||||||
|
|
||||||
|
createSummaryView(parseResult:ParseResult) {
|
||||||
|
return <div>
|
||||||
|
Detected
|
||||||
|
{ ' ' + parseResult.summary.foundTocPages + ' ' } table of content pages.
|
||||||
|
</div>;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
transform(parseResult:ParseResult) {
|
||||||
|
|
||||||
|
var foundTocPages = 0;
|
||||||
|
var x = Math.min(12, parseResult.content.length);
|
||||||
|
const textCombiner = new TextItemCombiner({});
|
||||||
|
parseResult.content.slice(0, x).forEach(page => {
|
||||||
|
var linesCount = 0;
|
||||||
|
var linesWithDigitsCount = 0;
|
||||||
|
var lineItemsWithDigits = [];
|
||||||
|
var headlineBlock;
|
||||||
|
page.blocks.forEach(block => {
|
||||||
|
var blockHasLinesWithDigits = false;
|
||||||
|
const itemsGroupedByY = textCombiner.combine(block.textItems);
|
||||||
|
itemsGroupedByY.forEach(lineItem => {
|
||||||
|
linesCount++
|
||||||
|
var lineText = lineItem.text.replace(/\./g, '').trim();
|
||||||
|
var endsWithDigit = false;
|
||||||
|
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
|
||||||
|
lineText = lineText.substring(0, lineText.length - 2);
|
||||||
|
endsWithDigit = true;
|
||||||
|
}
|
||||||
|
lineText = lineText.trim();
|
||||||
|
if (endsWithDigit) {
|
||||||
|
linesWithDigitsCount++;
|
||||||
|
blockHasLinesWithDigits = true;
|
||||||
|
lineItemsWithDigits.push(new TextItem({
|
||||||
|
...lineItem,
|
||||||
|
text: lineText
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (!blockHasLinesWithDigits) {
|
||||||
|
if (!headlineBlock) {
|
||||||
|
headlineBlock = block;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (linesWithDigitsCount * 100 / linesCount > 75) {
|
||||||
|
foundTocPages++;
|
||||||
|
page.blocks.forEach(block => {
|
||||||
|
if (block !== headlineBlock) {
|
||||||
|
block.annotation = REMOVED_ANNOTATION;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
page.blocks.push(new PdfBlock({
|
||||||
|
textItems: lineItemsWithDigits,
|
||||||
|
type: TOC_BLOCK,
|
||||||
|
annotation: ADDED_ANNOTATION
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return new ParseResult({
|
||||||
|
...parseResult,
|
||||||
|
summary: {
|
||||||
|
foundTocPages: foundTocPages
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user