[WIP] move unused stuff in separate folder

This commit is contained in:
Johannes Zillmann 2017-03-10 06:30:18 +01:00
parent 111124fbf3
commit e2ddf0312b
10 changed files with 205 additions and 0 deletions

View File

@ -0,0 +1,28 @@
// Input is a stream of heights which are potential headers; output is the header level for each height, or the judgement that it is not a header.
// Levels range from 1 to 6, where 1 is the biggest headline.
// HeaderLevelAssigner is used with a start level. If the start level is 2, then the first headline will be of level 2 and level 1 will never be assigned.
export default class HeaderLevelAssigner {
constructor(options) {
this.startLevel = options.startLevel;
this.paragraphHeight = options.paragraphHeight;
this.lastLevel = null;
this.lastHeight = null;
this.heightToLevel = {};
}
add(height) {
if (!this.lastHeight) {
this.lastLevel = this.startLevel;
this.heightToLevel[height] = this.startLevel;
} else {
const existingLevel = this.heightToLevel[height];
if (!existingLevel) {
//
}
}
this.lastHeight = height;
}
}

View File

@ -0,0 +1,70 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
import { isNumber } from '../../functions.jsx'
export default class DetectFootnoteOld extends ToTextItemBlockTransformation {
constructor() {
super("Detect Footnote ");
}
transform(parseResult:ParseResult) {
var nextFooterNumber = 1;
var potentialFootnoteItem;
var foundFootnotes = 0;
const newContent = parseResult.content.map(page => {
const newTextItems = [];
for (var i = 0; i < page.textItems.length; i++) {
const item = page.textItems[i];
if (potentialFootnoteItem) {
if (potentialFootnoteItem.y - item.y < item.height) {
potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
item.annotation = REMOVED_ANNOTATION;
newTextItems.push(potentialFootnoteItem);
newTextItems.push(item);
newTextItems.push(new TextItem({
x: potentialFootnoteItem.x,
y: item.y,
width: potentialFootnoteItem.width + item.width,
height: item.height,
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
annotation: ADDED_ANNOTATION
}));
//TODO repsect multiline!!
nextFooterNumber++;
foundFootnotes++;
}
potentialFootnoteItem = null;
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
potentialFootnoteItem = item;
} else {
newTextItems.push(item);
}
}
return {
...page,
textItems: newTextItems
};
});
return new ParseResult({
...parseResult,
content: newContent,
messages: ['Detected ' + foundFootnotes + ' footnotes']
});
}
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return parseResult;
}
}

View File

@ -0,0 +1,107 @@
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
import Headline from '../markdown/Headline.jsx';
/**
 * Returns the height with the highest occurrence count.
 *
 * @param heightToOccurrence object mapping a height (as property key) to the
 *                           number of times it occurred
 * @returns the winning key parsed with `parseInt` (so a fractional key such as
 *          "11.5" yields 11), or 0 when no entry has a count above zero.
 *          On equal counts the key that is enumerated first wins.
 */
function getMostUsedHeight(heightToOccurrence) {
    let bestKey = '0';
    let bestCount = 0;
    for (const key of Object.keys(heightToOccurrence)) {
        const count = heightToOccurrence[key];
        if (count > bestCount) {
            bestCount = count;
            bestKey = key;
        }
    }
    return parseInt(bestKey);
}
export default class HeadlineDetector extends Transformation {
constructor() {
super("Detect Headlines");
}
contentView() {
return ContentView.PDF;
}
// Strategy:
// - find most used height => this & every height below is paragraph
// - heights which start a page are likely to be headlines
// - maxHeigth is likely a headline
// - heights which occur on more then one page are likely to be headlines
transform(pages:PdfPage[]) {
const heightToOccurrence = {};
pages.forEach(page => {
page.textItems.forEach(item => {
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
});
});
console.debug(heightToOccurrence);
const mostUsedHeight = getMostUsedHeight(heightToOccurrence);
console.debug("mostUsedHeight: " + mostUsedHeight);
const headlineHeights = new Set(Object.keys(heightToOccurrence).filter(height => parseInt(height) > mostUsedHeight).map(elem => parseInt(elem)));
console.debug(Array.from(headlineHeights));
const headlineHeights2 = new Set();
pages.forEach(page => {
const textItems = page.textItems;
for (var i = 0; i < textItems.length; i++) {
const item = textItems[i];
if (item.height > mostUsedHeight) {
item.annotation = ADDED_ANNOTATION;
const firstItemOnPage = i == 0;
var upperDistance = 99;
if (!firstItemOnPage) {
upperDistance = textItems[i - 1].y - item.y - item.height;
}
var lowerDistance = 0;
const lastItemOnPage = i == textItems.length - 1;
if (!lastItemOnPage) {
lowerDistance = item.y - textItems[i + 1].y - textItems[i + 1].height;
}
if (firstItemOnPage) {
console.debug("add " + item.height);
console.debug("potential headline: " + item.height + " | " + item.text);
console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
headlineHeights2.add(item.height);
}
// if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) {
// console.debug("remove " + item.height);
// console.debug("potential headline: " + item.height + " | " + item.text);
// console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
// console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
// headlineHeights.delete(item.height);
// }
// if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) {
// item.annotation = ADDED_ANNOTATION;
// }
// console.debug("potential headline: " + item.height + " | " + item.text);
// console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
// console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
}
}
});
console.debug(Array.from(headlineHeights2));
return pages;
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
}
}