mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-25 03:51:33 +02:00
[WIP] move unused stuff in separate folder
This commit is contained in:
parent
111124fbf3
commit
e2ddf0312b
28
src/javascript/models/HeaderLevelAssigner.jsx
Normal file
28
src/javascript/models/HeaderLevelAssigner.jsx
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
|
||||||
|
// Input is a flow of heights which are potential headers; output is the header level for each height,
// or the judgement that it is no header.
// Levels are from 1..6, where 1 is the biggest headline.
// HeaderLevelAssigner is used with a start level. If the start level is 2, then the first headline
// will be of level 2 and no level 1 will be given.
export default class HeaderLevelAssigner {

    /**
     * @param options configuration object; `options.startLevel` is the level given to the first
     *                height passed to `add`, `options.paragraphHeight` is stored as-is
     *                (it is not read anywhere in this class yet).
     */
    constructor(options) {
        this.startLevel = options.startLevel;
        this.paragraphHeight = options.paragraphHeight;
        // `null` means "no height has been added yet".
        this.lastLevel = null;
        this.lastHeight = null;
        // Maps a height to the header level that was assigned to it.
        this.heightToLevel = {};
    }

    /**
     * Feeds the next potential header height into the assigner.
     *
     * The very first height is mapped to the start level. For every later height the
     * previously assigned level is looked up; assigning a level to a height that has not
     * been seen before is not implemented yet.
     *
     * @param height height of the potential header
     */
    add(height) {
        // Compare against null explicitly: a truthiness check would treat a recorded height of 0
        // as "nothing recorded" and hand out the start level a second time.
        if (this.lastHeight === null) {
            this.lastLevel = this.startLevel;
            this.heightToLevel[height] = this.startLevel;
        } else {
            const existingLevel = this.heightToLevel[height];
            if (!existingLevel) {
                // TODO: level assignment for heights not seen before is not implemented yet.
            }
        }

        this.lastHeight = height;
    }
}
|
@ -0,0 +1,70 @@
|
|||||||
|
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import ParseResult from '../ParseResult.jsx';
|
||||||
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
|
import { isNumber } from '../../functions.jsx'
|
||||||
|
|
||||||
|
/**
 * Old footnote detection: looks for a text item holding the next expected footnote number and
 * merges it with the item that follows it into a single "[n] text" item.
 *
 * The merged originals are annotated as removed and the merged item as added;
 * `completeTransform` later drops the removed items and clears all annotations.
 */
export default class DetectFootnoteOld extends ToTextItemBlockTransformation {

    constructor() {
        super("Detect Footnote ");
    }

    /**
     * Builds a new ParseResult whose pages contain the annotated footnote merges.
     *
     * @param parseResult result of the previous transformation; its `content` is the list of pages
     * @returns a new ParseResult with replaced `content` and a single message stating how many
     *          footnotes were detected
     */
    transform(parseResult:ParseResult) {

        // The expected number and the pending candidate are shared by all pages, i.e. footnote
        // numbering continues across page boundaries.
        var nextFooterNumber = 1;
        var potentialFootnoteItem;
        var foundFootnotes = 0;

        const newContent = parseResult.content.map(page => {
            const newTextItems = [];
            for (var i = 0; i < page.textItems.length; i++) {
                const item = page.textItems[i];
                if (potentialFootnoteItem) {
                    // The candidate number counts as a footnote marker when its vertical offset to
                    // the following item is smaller than that item's height
                    // (presumably: both sit on the same line - TODO confirm).
                    if (potentialFootnoteItem.y - item.y < item.height) {
                        potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
                        item.annotation = REMOVED_ANNOTATION;
                        newTextItems.push(potentialFootnoteItem);
                        newTextItems.push(item);
                        newTextItems.push(new TextItem({
                            x: potentialFootnoteItem.x,
                            y: item.y,
                            width: potentialFootnoteItem.width + item.width,
                            height: item.height,
                            text: '[' + potentialFootnoteItem.text + '] ' + item.text,
                            annotation: ADDED_ANNOTATION
                        }));
                        //TODO respect multiline!!
                        nextFooterNumber++;
                        foundFootnotes++;
                    } else {
                        // Not a footnote after all: keep the candidate and the current item
                        // untouched. Without this both items would silently disappear from the page.
                        newTextItems.push(potentialFootnoteItem);
                        newTextItems.push(item);
                    }
                    potentialFootnoteItem = null;
                } else if (isNumber(item.text)
                    && parseInt(item.text, 10) === nextFooterNumber
                    && i > 0
                    && i < page.textItems.length - 1
                    && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
                    // Candidate: the next expected number, neither first nor last on the page, and
                    // its neighbours are not at the same y. It is held back until the next item
                    // has been inspected.
                    potentialFootnoteItem = item;
                } else {
                    newTextItems.push(item);
                }
            }
            return {
                ...page,
                textItems: newTextItems
            };
        });

        return new ParseResult({
            ...parseResult,
            content: newContent,
            messages: ['Detected ' + foundFootnotes + ' footnotes']
        });
    }

    /**
     * Finalizes the transformation in place: removes every text item annotated as removed and
     * resets the annotation of all remaining items to null.
     *
     * @param parseResult the result produced by `transform`; it is mutated
     * @returns the same `parseResult` instance
     */
    completeTransform(parseResult:ParseResult) {
        parseResult.content.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
            page.textItems.forEach(textItem => textItem.annotation = null)
        });
        return parseResult;
    }

}
|
107
src/javascript/models/transformations/old/HeadlineDetector2.jsx
Normal file
107
src/javascript/models/transformations/old/HeadlineDetector2.jsx
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
|
import Headline from '../markdown/Headline.jsx';
|
||||||
|
|
||||||
|
/**
 * Returns the height with the highest occurrence count.
 *
 * @param heightToOccurrence object mapping a height (used as property key) to its occurrence count
 * @returns the winning key parsed with `parseInt` (so a fractional key is truncated);
 *          0 when no entry has a count greater than 0. On equal counts the key that comes
 *          first in `Object.keys` order wins.
 */
function getMostUsedHeight(heightToOccurrence) {
    let winningHeight = '0';
    let winningCount = 0;
    for (const heightKey of Object.keys(heightToOccurrence)) {
        const count = heightToOccurrence[heightKey];
        // Strictly greater: a later key with the same count does not replace the current winner.
        if (count > winningCount) {
            winningCount = count;
            winningHeight = heightKey;
        }
    }
    return parseInt(winningHeight);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Experimental headline detection: annotates every text item that is higher than the most used
 * height and logs the collected height statistics via `console.debug`.
 */
export default class HeadlineDetector extends Transformation {

    constructor() {
        super("Detect Headlines");
    }

    /** @returns the content view of this transformation, `ContentView.PDF` */
    contentView() {
        return ContentView.PDF;
    }

    // Strategy:
    // - find the most used height => this & every height below is paragraph
    // - heights which start a page are likely to be headlines
    // - the max height is likely a headline
    // - heights which occur on more than one page are likely to be headlines
    /**
     * Marks all items higher than the most used height with `ADDED_ANNOTATION`.
     *
     * @param pages the pages to inspect; the annotation of their text items is mutated
     * @returns the same `pages` array
     */
    transform(pages:PdfPage[]) {

        // Count how often each item height occurs over all pages.
        const heightToOccurrence = {};
        for (const page of pages) {
            for (const textItem of page.textItems) {
                heightToOccurrence[textItem.height] = (heightToOccurrence[textItem.height] || 0) + 1;
            }
        }
        console.debug(heightToOccurrence);

        const mostUsedHeight = getMostUsedHeight(heightToOccurrence);
        console.debug("mostUsedHeight: " + mostUsedHeight);

        // All distinct heights above the most used one (currently only logged).
        const headlineHeights = new Set();
        Object.keys(heightToOccurrence).forEach(heightKey => {
            const parsedHeight = parseInt(heightKey);
            if (parsedHeight > mostUsedHeight) {
                headlineHeights.add(parsedHeight);
            }
        });
        console.debug(Array.from(headlineHeights));

        // Heights of over-sized items which are the first item of a page (currently only logged).
        const headlineHeights2 = new Set();
        pages.forEach(page => {
            const textItems = page.textItems;
            textItems.forEach((item, index) => {
                if (!(item.height > mostUsedHeight)) {
                    return;
                }

                item.annotation = ADDED_ANNOTATION;

                const firstItemOnPage = index === 0;
                const lastItemOnPage = index === textItems.length - 1;
                // Gap to the previous item; 99 is the placeholder used when there is none.
                const upperDistance = firstItemOnPage
                    ? 99
                    : textItems[index - 1].y - item.y - item.height;
                // Gap to the next item; 0 when there is none.
                const lowerDistance = lastItemOnPage
                    ? 0
                    : item.y - textItems[index + 1].y - textItems[index + 1].height;

                if (firstItemOnPage) {
                    console.debug("add " + item.height);
                    console.debug("potential headline: " + item.height + " | " + item.text);
                    console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
                    console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
                    headlineHeights2.add(item.height);
                }

                // Disabled experiment: drop heights whose items are not surrounded by enough space.
                // if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) {
                //     console.debug("remove " + item.height);
                //     console.debug("potential headline: " + item.height + " | " + item.text);
                //     console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
                //     console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
                //     headlineHeights.delete(item.height);
                // }

                // Disabled experiment: annotate only items with a fixed minimum distance around them.
                // if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) {
                //     item.annotation = ADDED_ANNOTATION;
                // }
                // console.debug("potential headline: " + item.height + " | " + item.text);
                // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
                // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
            });
        });
        console.debug(Array.from(headlineHeights2));

        return pages;
    }

    /**
     * Clears the annotation of every text item on every page.
     *
     * @param pages the pages to clean up; their text items are mutated
     * @returns the same `pages` array
     */
    processAnnotations(pages:PdfPage[]) {
        for (const page of pages) {
            for (const textItem of page.textItems) {
                textItem.annotation = null;
            }
        }
        return pages;
    }

}
|
Loading…
x
Reference in New Issue
Block a user