mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-26 12:31:52 +02:00
[WIP] detect TOC on text items, not on blocks
This commit is contained in:
parent
e2481bdd2a
commit
bd4c207ae3
@ -49,6 +49,9 @@ export default class TextItemTable extends React.Component {
|
|||||||
<div style={ { textAlign: 'center' } }>
|
<div style={ { textAlign: 'center' } }>
|
||||||
{ textItem.annotation ? textItem.annotation.category : '' }
|
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||||
</div>
|
</div>
|
||||||
|
<div style={ { textAlign: 'center', color: 'brown' } }>
|
||||||
|
{ textItem.type ? textItem.type : '' }
|
||||||
|
</div>
|
||||||
<div style={ { textAlign: 'center', color: 'orange' } }>
|
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||||
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
||||||
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
|
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
|
||||||
|
@ -1,13 +1,13 @@
|
|||||||
import { Enum } from 'enumify';
|
import { Enum } from 'enumify';
|
||||||
|
|
||||||
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||||
|
import CompactLines from './transformations/CompactLines.jsx';
|
||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||||
import CompactLines from './transformations/CompactLines.jsx';
|
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||||
|
|
||||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
|
||||||
import DetectLists from './transformations/DetectLists.jsx'
|
import DetectLists from './transformations/DetectLists.jsx'
|
||||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||||
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
||||||
@ -34,10 +34,10 @@ export default class AppState {
|
|||||||
new CompactLines(),
|
new CompactLines(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new VerticalToHorizontal(),
|
new VerticalToHorizontal(),
|
||||||
|
new DetectTOC(),
|
||||||
|
|
||||||
new DetectPdfBlocks(),
|
new DetectPdfBlocks(),
|
||||||
new DetectFootnotes(),
|
new DetectFootnotes(),
|
||||||
new DetectTOC(),
|
|
||||||
new DetectLists(),
|
new DetectLists(),
|
||||||
new DetectCodeBlocks(),
|
new DetectCodeBlocks(),
|
||||||
new DetectHeadlines(),
|
new DetectHeadlines(),
|
||||||
|
@ -1,45 +1,34 @@
|
|||||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||||
import ParseResult from '../ParseResult.jsx';
|
import ParseResult from '../ParseResult.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
|
||||||
import HeadlineFinder from '../HeadlineFinder.jsx';
|
import HeadlineFinder from '../HeadlineFinder.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||||
import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
|
import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
|
||||||
import { isDigit } from '../../functions.jsx'
|
import { isDigit } from '../../functions.jsx'
|
||||||
|
|
||||||
//Detect table of contents pages
|
//Detect table of contents pages
|
||||||
export default class DetectTOC extends ToTextItemBlockTransformation {
|
export default class DetectTOC extends ToTextItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect TOC");
|
super("Detect TOC");
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const {mostUsedDistance} = parseResult.globals;
|
|
||||||
const tocPages = [];
|
const tocPages = [];
|
||||||
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
||||||
const textCombiner = new TextItemCombiner({
|
|
||||||
mostUsedDistance: mostUsedDistance
|
|
||||||
});
|
|
||||||
|
|
||||||
const linkLeveler = new LinkLeveler();
|
const linkLeveler = new LinkLeveler();
|
||||||
var tocLinks = [];
|
var tocLinks = [];
|
||||||
var lastTocPage;
|
var lastTocPage;
|
||||||
|
var headlineItem;
|
||||||
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
||||||
var linesCount = 0;
|
const lineItemsWithDigits = [];
|
||||||
var linesWithDigitsCount = 0;
|
const unknownLines = new Set();
|
||||||
var lineItemsWithDigits = [];
|
|
||||||
const unknownBlocks = new Set();
|
|
||||||
var headlineBlock;
|
|
||||||
const pageTocLinks = [];
|
const pageTocLinks = [];
|
||||||
page.items.forEach(block => {
|
|
||||||
var blockHasLinesWithDigits = false;
|
|
||||||
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
|
|
||||||
var lastLineTextWithoutNumber;
|
var lastLineTextWithoutNumber;
|
||||||
itemsGroupedByY.forEach(lineItem => {
|
var lastLine;
|
||||||
linesCount++
|
page.items.forEach(line => {
|
||||||
var lineText = lineItem.text.replace(/\./g, '').trim();
|
var lineText = line.text.replace(/\./g, '').trim();
|
||||||
var endsWithDigit = false;
|
var endsWithDigit = false;
|
||||||
var digits = [];
|
var digits = [];
|
||||||
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
|
while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
|
||||||
@ -49,58 +38,60 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
|
|||||||
}
|
}
|
||||||
lineText = lineText.trim();
|
lineText = lineText.trim();
|
||||||
if (endsWithDigit) {
|
if (endsWithDigit) {
|
||||||
|
endsWithDigit = true;
|
||||||
if (lastLineTextWithoutNumber) { // 2-line item ?
|
if (lastLineTextWithoutNumber) { // 2-line item ?
|
||||||
lineText = lastLineTextWithoutNumber + ' ' + lineText;
|
lineText = lastLineTextWithoutNumber + ' ' + lineText;
|
||||||
lastLineTextWithoutNumber = null;
|
lastLineTextWithoutNumber = null;
|
||||||
}
|
}
|
||||||
linesWithDigitsCount++;
|
|
||||||
blockHasLinesWithDigits = true;
|
|
||||||
pageTocLinks.push(new TocLink({
|
pageTocLinks.push(new TocLink({
|
||||||
pageNumber: parseInt(digits.join('')),
|
pageNumber: parseInt(digits.join('')),
|
||||||
textItem: new TextItem({
|
textItem: new TextItem({
|
||||||
...lineItem,
|
...line,
|
||||||
text: lineText
|
text: lineText
|
||||||
})
|
})
|
||||||
}));
|
}));
|
||||||
lineItemsWithDigits.push(new TextItem({
|
lineItemsWithDigits.push(new TextItem({
|
||||||
...lineItem,
|
...line,
|
||||||
text: lineText
|
text: lineText
|
||||||
}));
|
}));
|
||||||
|
lastLineTextWithoutNumber = null;
|
||||||
} else {
|
} else {
|
||||||
lastLineTextWithoutNumber = lineText;
|
if (!headlineItem) {
|
||||||
|
headlineItem = line;
|
||||||
|
} else {
|
||||||
|
if (lastLineTextWithoutNumber) {
|
||||||
|
unknownLines.add(lastLine);
|
||||||
}
|
}
|
||||||
});
|
lastLineTextWithoutNumber = lineText;
|
||||||
if (!blockHasLinesWithDigits) {
|
lastLine = line;
|
||||||
if (!headlineBlock) {
|
|
||||||
headlineBlock = block;
|
|
||||||
} else {
|
|
||||||
unknownBlocks.add(block);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// page has been processed
|
// page has been processed
|
||||||
if (linesWithDigitsCount * 100 / linesCount > 75) {
|
if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
|
||||||
tocPages.push(page.index + 1);
|
tocPages.push(page.index + 1);
|
||||||
lastTocPage = page;
|
lastTocPage = page;
|
||||||
linkLeveler.levelPageItems(pageTocLinks);
|
linkLeveler.levelPageItems(pageTocLinks);
|
||||||
tocLinks = tocLinks.concat(pageTocLinks);
|
tocLinks = tocLinks.concat(pageTocLinks);
|
||||||
|
|
||||||
const newBlocks = [];
|
const newBlocks = [];
|
||||||
page.items.forEach((block) => {
|
page.items.forEach((line) => {
|
||||||
if (!unknownBlocks.has(block)) {
|
if (!unknownLines.has(line)) {
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
line.annotation = REMOVED_ANNOTATION;
|
||||||
}
|
}
|
||||||
newBlocks.push(block);
|
newBlocks.push(line);
|
||||||
if (block === headlineBlock) {
|
if (line === headlineItem) {
|
||||||
newBlocks.push(new TextItemBlock({
|
newBlocks.push(new TextItem({
|
||||||
textItems: textCombiner.combine(block.textItems).textItems,
|
...line,
|
||||||
type: HEADLINE2,
|
type: HEADLINE2,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
page.items = newBlocks;
|
page.items = newBlocks;
|
||||||
|
} else {
|
||||||
|
headlineItem = null;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -112,11 +103,11 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
|
|||||||
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
||||||
var foundHeadline = false;
|
var foundHeadline = false;
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
|
foundHeadline = findHeadline(linkedPage, tocLink);
|
||||||
if (!foundHeadline) { // pages are off by 1 ?
|
if (!foundHeadline) { // pages are off by 1 ?
|
||||||
linkedPage = parseResult.pages[tocLink.pageNumber];
|
linkedPage = parseResult.pages[tocLink.pageNumber];
|
||||||
if (linkedPage) {
|
if (linkedPage) {
|
||||||
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
|
foundHeadline = findHeadline(linkedPage, tocLink);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -126,14 +117,13 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
|
|||||||
notFoundHeadlines.push(tocLink);
|
notFoundHeadlines.push(tocLink);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
lastTocPage.items.push(new TextItemBlock({
|
tocLinks.forEach(tocLink => {
|
||||||
textItems: tocLinks.map(tocLink => {
|
lastTocPage.items.push(new TextItem({
|
||||||
tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
|
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
||||||
return tocLink.textItem
|
|
||||||
}),
|
|
||||||
type: TOC_BLOCK,
|
type: TOC_BLOCK,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const messages = [];
|
const messages = [];
|
||||||
@ -157,37 +147,25 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function findHeadline(page, tocLink, textCombiner) {
|
function findHeadline(page, tocLink) {
|
||||||
const headline = tocLink.textItem.text;
|
const headline = tocLink.textItem.text;
|
||||||
const headlineFinder = new HeadlineFinder({
|
const headlineFinder = new HeadlineFinder({
|
||||||
headline: headline
|
headline: headline
|
||||||
});
|
});
|
||||||
var blockIndex = 0;
|
var lineIndex = 0;
|
||||||
var lastBlock;
|
for ( var line of page.items ) {
|
||||||
for ( var block of page.items ) {
|
const headlineItems = headlineFinder.consume(line);
|
||||||
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
|
|
||||||
for ( var item of itemsGroupedByY ) {
|
|
||||||
const headlineItems = headlineFinder.consume(item);
|
|
||||||
if (headlineItems) {
|
if (headlineItems) {
|
||||||
const usedItems = headlineFinder.stackedTextItems;
|
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
page.items.splice(lineIndex + 1, 0, new TextItem({
|
||||||
if (usedItems.length > itemsGroupedByY.length) {
|
...headlineItems[0],
|
||||||
// 2 line headline
|
text: headline,
|
||||||
lastBlock.annotation = REMOVED_ANNOTATION;
|
|
||||||
}
|
|
||||||
page.items.splice(blockIndex + 1, 0, new TextItemBlock({
|
|
||||||
textItems: [new TextItem({
|
|
||||||
...usedItems[0],
|
|
||||||
text: headline
|
|
||||||
})],
|
|
||||||
type: headlineByLevel(tocLink.level + 2),
|
type: headlineByLevel(tocLink.level + 2),
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
lineIndex++;
|
||||||
blockIndex++;
|
|
||||||
lastBlock = block;
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user