mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-25 01:03:59 +01:00
Add not perfect headline detection
This commit is contained in:
parent
e90226c1d8
commit
0245ea16f1
@ -5,6 +5,7 @@ import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
|||||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
|
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||||
import ToTextPages from './transformations/ToTextPages.jsx';
|
import ToTextPages from './transformations/ToTextPages.jsx';
|
||||||
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
|
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
|
||||||
|
|
||||||
@ -22,6 +23,7 @@ export default class AppState {
|
|||||||
new CombineSameY(),
|
new CombineSameY(),
|
||||||
new DetectFootnotes(),
|
new DetectFootnotes(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
|
new HeadlineDetector(),
|
||||||
new ToTextPages(),
|
new ToTextPages(),
|
||||||
new ToSingleTextPage()];
|
new ToSingleTextPage()];
|
||||||
|
|
||||||
|
156
src/javascript/models/transformations/HeadlineDetector.jsx
Normal file
156
src/javascript/models/transformations/HeadlineDetector.jsx
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
|
|
||||||
|
function analyzeHeigths(pages) {
|
||||||
|
const analyzationResult = {
|
||||||
|
maxHeight: 0,
|
||||||
|
maxYPerPage: {},
|
||||||
|
heights: [],
|
||||||
|
mostUsedHeight: -1
|
||||||
|
};
|
||||||
|
const allHeights = new Set();
|
||||||
|
pages.forEach(page => {
|
||||||
|
var maxPageY = 0;
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
const height = item.height;
|
||||||
|
allHeights.add(height);
|
||||||
|
if (analyzationResult[height]) {
|
||||||
|
analyzationResult[height].repetition = analyzationResult[height].repetition + 1;
|
||||||
|
analyzationResult[height].pages.add(page.index);
|
||||||
|
} else {
|
||||||
|
analyzationResult[height] = {
|
||||||
|
repetition: 1,
|
||||||
|
pages: new Set([page.index])
|
||||||
|
};
|
||||||
|
}
|
||||||
|
maxPageY = Math.max(maxPageY, item.y);
|
||||||
|
analyzationResult.maxHeight = Math.max(analyzationResult.maxHeight, item.height);
|
||||||
|
});
|
||||||
|
analyzationResult.maxYPerPage[page.index] = maxPageY;
|
||||||
|
});
|
||||||
|
|
||||||
|
var maxRepetition = 0;
|
||||||
|
allHeights.forEach(height => {
|
||||||
|
const heightRepetition = analyzationResult[height].repetition;
|
||||||
|
analyzationResult.heights.push(height);
|
||||||
|
if (heightRepetition > maxRepetition) {
|
||||||
|
maxRepetition = heightRepetition;
|
||||||
|
analyzationResult.mostUsedHeight = height;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
analyzationResult.heights = analyzationResult.heights.sort((a, b) => a - b);
|
||||||
|
|
||||||
|
return analyzationResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
function findNextMajorHeight(heights, currentHeight, headlineMap) {
|
||||||
|
for (var i = currentHeight; i < heights.length; i++) {
|
||||||
|
if (headlineMap[heights[i]]) {
|
||||||
|
return heights[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineMap=${headlineMap}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export default class HeadlineDetector extends Transformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Detect Headlines");
|
||||||
|
}
|
||||||
|
|
||||||
|
contentView() {
|
||||||
|
return ContentView.PDF;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strategy:
|
||||||
|
// - find most used height => this & every height below is paragraph
|
||||||
|
// - heights which start a page are likely to be headlines
|
||||||
|
// - maxHeigth is likely a headline
|
||||||
|
// - heights which occur on more then one page are likely to be headlines
|
||||||
|
transform(pages:PdfPage[]) {
|
||||||
|
const heightAnalyzation = analyzeHeigths(pages);
|
||||||
|
|
||||||
|
var paragraphHeight = heightAnalyzation.mostUsedHeight + 1;
|
||||||
|
|
||||||
|
// text with more hight then the paragraph height which are on the top of the page are likely to be headlines
|
||||||
|
const likelyHeadingHeights = new Set();
|
||||||
|
pages.forEach(page => {
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
if (item.height > paragraphHeight && heightAnalyzation.maxYPerPage[page.index] == item.y) {
|
||||||
|
likelyHeadingHeights.add(item.height);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
const headlineHeights = [];
|
||||||
|
heightAnalyzation.heights.forEach(height => {
|
||||||
|
if (height == heightAnalyzation.maxHeight || (height > paragraphHeight && likelyHeadingHeights.has(height) && heightAnalyzation[height].pages.size > 1)) {
|
||||||
|
headlineHeights.push(height);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
const headlineMap = {};
|
||||||
|
headlineHeights.reverse().forEach((height, i) => headlineMap[height] = '#'.repeat(i + 1));
|
||||||
|
var lastMajorHeight = paragraphHeight;
|
||||||
|
var heights = heightAnalyzation.heights;
|
||||||
|
for (var i = 0; i < heights.length; i++) {
|
||||||
|
if (heights[i] > paragraphHeight && !headlineMap[heights[i]]) {
|
||||||
|
const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineMap);
|
||||||
|
const distanceToLower = heights[i] - lastMajorHeight;
|
||||||
|
const distanceToHigher = nextMajorHeight - heights[i];
|
||||||
|
if (distanceToLower <= distanceToHigher) {
|
||||||
|
if (lastMajorHeight == paragraphHeight) {
|
||||||
|
paragraphHeight++;
|
||||||
|
} else {
|
||||||
|
headlineMap[heights[i]] = headlineMap[lastMajorHeight];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
headlineMap[heights[i]] = headlineMap[nextMajorHeight];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (headlineMap[heights[i]]) {
|
||||||
|
lastMajorHeight = heights[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return pages.map(page => {
|
||||||
|
const newTextItems = [];
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
if (item.height <= paragraphHeight) {
|
||||||
|
newTextItems.push(item);
|
||||||
|
} else {
|
||||||
|
newTextItems.push(new TextItem({
|
||||||
|
...item,
|
||||||
|
text: item.text,
|
||||||
|
annotation: new Annotation({
|
||||||
|
category: headlineMap[item.height],
|
||||||
|
color: 'green'
|
||||||
|
})
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return {
|
||||||
|
...page,
|
||||||
|
textItems: newTextItems
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
processAnnotations(pages:PdfPage[]) {
|
||||||
|
pages.forEach(page => {
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
if (item.annotation) {
|
||||||
|
item.text = item.annotation.category + ' ' + item.text;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user