mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-22 07:43:46 +01:00
[WIP] small fixes
This commit is contained in:
parent
5827379d1b
commit
1fcd08f6d5
@ -1,5 +1,5 @@
|
|||||||
import TextItem from './TextItem.jsx';
|
import TextItem from './TextItem.jsx';
|
||||||
import { isNumber, isDigit } from '../functions.jsx'
|
import { isNumber } from '../functions.jsx'
|
||||||
import { sortByX } from '../textItemFunctions.jsx'
|
import { sortByX } from '../textItemFunctions.jsx'
|
||||||
|
|
||||||
//Combines text items which are on the same Y at the same time doing inline transformations like
|
//Combines text items which are on the same Y at the same time doing inline transformations like
|
||||||
@ -39,6 +39,7 @@ export default class TextItemCombiner {
|
|||||||
text += item.text;
|
text += item.text;
|
||||||
widthSum += item.width;
|
widthSum += item.width;
|
||||||
lastItem = item;
|
lastItem = item;
|
||||||
|
maxHeight = Math.max(maxHeight, item.height);
|
||||||
});
|
});
|
||||||
resultItems.push(new TextItem({
|
resultItems.push(new TextItem({
|
||||||
...itemGroup[0],
|
...itemGroup[0],
|
||||||
|
@ -34,7 +34,7 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation {
|
|||||||
type: FOOTNOTE_BLOCK,
|
type: FOOTNOTE_BLOCK,
|
||||||
annotation: ADDED_ANNOTATION,
|
annotation: ADDED_ANNOTATION,
|
||||||
parsedElements: combineResult.parsedElements
|
parsedElements: combineResult.parsedElements
|
||||||
})
|
});
|
||||||
newBlocks.push(lastFootnote);
|
newBlocks.push(lastFootnote);
|
||||||
} else if (lastFootnote) {
|
} else if (lastFootnote) {
|
||||||
// likely to be the second line of aboves footnote
|
// likely to be the second line of aboves footnote
|
||||||
|
@ -4,7 +4,7 @@ import TextItem from '../TextItem.jsx';
|
|||||||
import PdfBlock from '../PdfBlock.jsx';
|
import PdfBlock from '../PdfBlock.jsx';
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||||
import { TOC_BLOCK } from '../MarkdownElements.jsx';
|
import { TOC_BLOCK, HEADLINE2 } from '../MarkdownElements.jsx';
|
||||||
import { isDigit } from '../../functions.jsx'
|
import { isDigit } from '../../functions.jsx'
|
||||||
|
|
||||||
//Detect table of contents pages
|
//Detect table of contents pages
|
||||||
@ -17,13 +17,13 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
|||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const {mostUsedDistance} = parseResult.globals;
|
const {mostUsedDistance} = parseResult.globals;
|
||||||
var foundTocPages = 0;
|
var foundTocPages = 0;
|
||||||
var x = Math.min(12, parseResult.content.length);
|
const maxPagesToEvaluate = Math.min(20, parseResult.content.length);
|
||||||
const textCombiner = new TextItemCombiner({
|
const textCombiner = new TextItemCombiner({
|
||||||
mostUsedDistance: mostUsedDistance
|
mostUsedDistance: mostUsedDistance
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
parseResult.content.slice(0, x).forEach(page => {
|
parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
|
||||||
var linesCount = 0;
|
var linesCount = 0;
|
||||||
var linesWithDigitsCount = 0;
|
var linesWithDigitsCount = 0;
|
||||||
var lineItemsWithDigits = [];
|
var lineItemsWithDigits = [];
|
||||||
@ -49,25 +49,32 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
if (!blockHasLinesWithDigits) {
|
if (!headlineBlock && !blockHasLinesWithDigits) {
|
||||||
if (!headlineBlock) {
|
|
||||||
headlineBlock = block;
|
headlineBlock = block;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
if (linesWithDigitsCount * 100 / linesCount > 75) {
|
if (linesWithDigitsCount * 100 / linesCount > 75) {
|
||||||
foundTocPages++;
|
foundTocPages++;
|
||||||
page.blocks.forEach(block => {
|
const newBlocks = [];
|
||||||
if (block !== headlineBlock) {
|
page.blocks.forEach((block) => {
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
block.annotation = REMOVED_ANNOTATION;
|
||||||
|
newBlocks.push(block);
|
||||||
|
|
||||||
|
if (block === headlineBlock) {
|
||||||
|
newBlocks.push(new PdfBlock({
|
||||||
|
textItems: textCombiner.combine(block.textItems).textItems,
|
||||||
|
type: HEADLINE2,
|
||||||
|
annotation: ADDED_ANNOTATION
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
page.blocks.push(new PdfBlock({
|
newBlocks.push(new PdfBlock({
|
||||||
textItems: lineItemsWithDigits,
|
textItems: lineItemsWithDigits,
|
||||||
type: TOC_BLOCK,
|
type: TOC_BLOCK,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
|
page.blocks = newBlocks;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user