mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-24 19:41:24 +02:00
[WIP] move unused stuff in separate folder
This commit is contained in:
parent
111124fbf3
commit
e2ddf0312b
28
src/javascript/models/HeaderLevelAssigner.jsx
Normal file
28
src/javascript/models/HeaderLevelAssigner.jsx
Normal file
@ -0,0 +1,28 @@
|
||||
|
||||
// Input is a stream of heights which are potential headers; output is the header level for each height, or the judgement that it is not a header.
|
||||
// Levels are from 1..6, where 1 is the biggest headline.
|
||||
// HeaderLevelAssigner is used with a start level. If the start level is 2, then the first headline will be of level 2 and no level 1 will be given.
|
||||
export default class HeaderLevelAssigner {
|
||||
|
||||
constructor(options) {
|
||||
this.startLevel = options.startLevel;
|
||||
this.paragraphHeight = options.paragraphHeight;
|
||||
this.lastLevel = null;
|
||||
this.lastHeight = null;
|
||||
this.heightToLevel = {};
|
||||
}
|
||||
|
||||
add(height) {
|
||||
if (!this.lastHeight) {
|
||||
this.lastLevel = this.startLevel;
|
||||
this.heightToLevel[height] = this.startLevel;
|
||||
} else {
|
||||
const existingLevel = this.heightToLevel[height];
|
||||
if (!existingLevel) {
|
||||
//
|
||||
}
|
||||
}
|
||||
|
||||
this.lastHeight = height;
|
||||
}
|
||||
}
|
@ -0,0 +1,70 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import { isNumber } from '../../functions.jsx'
|
||||
|
||||
export default class DetectFootnoteOld extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Footnote ");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
|
||||
var nextFooterNumber = 1;
|
||||
var potentialFootnoteItem;
|
||||
var foundFootnotes = 0;
|
||||
|
||||
const newContent = parseResult.content.map(page => {
|
||||
const newTextItems = [];
|
||||
for (var i = 0; i < page.textItems.length; i++) {
|
||||
const item = page.textItems[i];
|
||||
if (potentialFootnoteItem) {
|
||||
if (potentialFootnoteItem.y - item.y < item.height) {
|
||||
potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(potentialFootnoteItem);
|
||||
newTextItems.push(item);
|
||||
newTextItems.push(new TextItem({
|
||||
x: potentialFootnoteItem.x,
|
||||
y: item.y,
|
||||
width: potentialFootnoteItem.width + item.width,
|
||||
height: item.height,
|
||||
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
//TODO repsect multiline!!
|
||||
nextFooterNumber++;
|
||||
foundFootnotes++;
|
||||
}
|
||||
potentialFootnoteItem = null;
|
||||
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
|
||||
potentialFootnoteItem = item;
|
||||
} else {
|
||||
newTextItems.push(item);
|
||||
}
|
||||
}
|
||||
return {
|
||||
...page,
|
||||
textItems: newTextItems
|
||||
};
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
messages: ['Detected ' + foundFootnotes + ' footnotes']
|
||||
});
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
107
src/javascript/models/transformations/old/HeadlineDetector2.jsx
Normal file
107
src/javascript/models/transformations/old/HeadlineDetector2.jsx
Normal file
@ -0,0 +1,107 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import Headline from '../markdown/Headline.jsx';
|
||||
|
||||
/**
 * Returns the height with the highest occurrence count.
 *
 * @param heightToOccurrence object mapping a height (as property key) to the
 *        number of times it occurred
 * @returns the most frequent height, parsed as a base-10 integer (a fractional
 *          key is truncated); 0 when the map is empty or no count is above 0.
 *          On a tie, the key that comes first in `Object.keys` order wins.
 */
function getMostUsedHeight(heightToOccurrence) {
    var maxOccurence = 0;
    // Property keys are strings, so keep the running winner as a string too.
    var maxHeight = '0';
    Object.keys(heightToOccurrence).forEach((height) => {
        if (heightToOccurrence[height] > maxOccurence) {
            maxOccurence = heightToOccurrence[height];
            maxHeight = height;
        }
    });
    return parseInt(maxHeight, 10);
}
|
||||
|
||||
|
||||
export default class HeadlineDetector extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Headlines");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
// Strategy:
|
||||
// - find most used height => this & every height below is paragraph
|
||||
// - heights which start a page are likely to be headlines
|
||||
// - maxHeigth is likely a headline
|
||||
// - heights which occur on more then one page are likely to be headlines
|
||||
transform(pages:PdfPage[]) {
|
||||
|
||||
const heightToOccurrence = {};
|
||||
pages.forEach(page => {
|
||||
page.textItems.forEach(item => {
|
||||
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
|
||||
});
|
||||
});
|
||||
console.debug(heightToOccurrence);
|
||||
const mostUsedHeight = getMostUsedHeight(heightToOccurrence);
|
||||
console.debug("mostUsedHeight: " + mostUsedHeight);
|
||||
|
||||
const headlineHeights = new Set(Object.keys(heightToOccurrence).filter(height => parseInt(height) > mostUsedHeight).map(elem => parseInt(elem)));
|
||||
console.debug(Array.from(headlineHeights));
|
||||
const headlineHeights2 = new Set();
|
||||
pages.forEach(page => {
|
||||
const textItems = page.textItems;
|
||||
for (var i = 0; i < textItems.length; i++) {
|
||||
const item = textItems[i];
|
||||
if (item.height > mostUsedHeight) {
|
||||
|
||||
item.annotation = ADDED_ANNOTATION;
|
||||
const firstItemOnPage = i == 0;
|
||||
var upperDistance = 99;
|
||||
if (!firstItemOnPage) {
|
||||
upperDistance = textItems[i - 1].y - item.y - item.height;
|
||||
}
|
||||
var lowerDistance = 0;
|
||||
const lastItemOnPage = i == textItems.length - 1;
|
||||
if (!lastItemOnPage) {
|
||||
lowerDistance = item.y - textItems[i + 1].y - textItems[i + 1].height;
|
||||
}
|
||||
if (firstItemOnPage) {
|
||||
console.debug("add " + item.height);
|
||||
console.debug("potential headline: " + item.height + " | " + item.text);
|
||||
console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
|
||||
console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
|
||||
headlineHeights2.add(item.height);
|
||||
}
|
||||
|
||||
// if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) {
|
||||
// console.debug("remove " + item.height);
|
||||
// console.debug("potential headline: " + item.height + " | " + item.text);
|
||||
// console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
|
||||
// console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
|
||||
// headlineHeights.delete(item.height);
|
||||
// }
|
||||
|
||||
|
||||
// if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) {
|
||||
// item.annotation = ADDED_ANNOTATION;
|
||||
// }
|
||||
// console.debug("potential headline: " + item.height + " | " + item.text);
|
||||
// console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
|
||||
// console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
|
||||
}
|
||||
}
|
||||
});
|
||||
console.debug(Array.from(headlineHeights2));
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user