From e2ddf0312b9167d32ee02c619eb717de6876afe4 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Fri, 10 Mar 2017 06:30:18 +0100 Subject: [PATCH] [WIP] move unused stuff in separate folder --- src/javascript/models/HeaderLevelAssigner.jsx | 28 +++++ .../{ => old}/CombineSameY.jsx | 0 .../transformations/old/DetectFootnoteOld.jsx | 70 ++++++++++++ .../{ => old}/DetectFormats.jsx | 0 .../transformations/{ => old}/DetectLinks.jsx | 0 .../{ => old}/HeadlineDetector.jsx | 0 .../transformations/old/HeadlineDetector2.jsx | 107 ++++++++++++++++++ .../{ => old}/HeadlineToUppercase.jsx | 0 .../{ => old}/RemoveWhitespaces.jsx | 0 .../{ => old}/ToBlockSystem.jsx | 0 10 files changed, 205 insertions(+) create mode 100644 src/javascript/models/HeaderLevelAssigner.jsx rename src/javascript/models/transformations/{ => old}/CombineSameY.jsx (100%) create mode 100644 src/javascript/models/transformations/old/DetectFootnoteOld.jsx rename src/javascript/models/transformations/{ => old}/DetectFormats.jsx (100%) rename src/javascript/models/transformations/{ => old}/DetectLinks.jsx (100%) rename src/javascript/models/transformations/{ => old}/HeadlineDetector.jsx (100%) create mode 100644 src/javascript/models/transformations/old/HeadlineDetector2.jsx rename src/javascript/models/transformations/{ => old}/HeadlineToUppercase.jsx (100%) rename src/javascript/models/transformations/{ => old}/RemoveWhitespaces.jsx (100%) rename src/javascript/models/transformations/{ => old}/ToBlockSystem.jsx (100%) diff --git a/src/javascript/models/HeaderLevelAssigner.jsx b/src/javascript/models/HeaderLevelAssigner.jsx new file mode 100644 index 0000000..81bffb8 --- /dev/null +++ b/src/javascript/models/HeaderLevelAssigner.jsx @@ -0,0 +1,28 @@ + +// Input is a flow of heights which are potential headers, output are header level for each height or the judgement, that it is no header +// Levels are from 1..6, where 1 is the biggest headline. +// HeaderLevelAssigner is use with an start level. If the start level is 2, then the first headline will be of level 2 and there will be no level 1 given. +export default class HeaderLevelAssigner { + + constructor(options) { + this.startLevel = options.startLevel; + this.paragraphHeight = options.paragraphHeight; + this.lastLevel = null; + this.lastHeight = null; + this.heightToLevel = {}; + } + + add(height) { + if (!this.lastHeight) { + this.lastLevel = this.startLevel; + this.heightToLevel[height] = this.startLevel; + } else { + const existingLevel = this.heightToLevel[height]; + if (!existingLevel) { + // + } + } + + this.lastHeight = height; + } +} \ No newline at end of file diff --git a/src/javascript/models/transformations/CombineSameY.jsx b/src/javascript/models/transformations/old/CombineSameY.jsx similarity index 100% rename from src/javascript/models/transformations/CombineSameY.jsx rename to src/javascript/models/transformations/old/CombineSameY.jsx diff --git a/src/javascript/models/transformations/old/DetectFootnoteOld.jsx b/src/javascript/models/transformations/old/DetectFootnoteOld.jsx new file mode 100644 index 0000000..c5ff231 --- /dev/null +++ b/src/javascript/models/transformations/old/DetectFootnoteOld.jsx @@ -0,0 +1,70 @@ +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; +import TextItem from '../TextItem.jsx'; +import ParseResult from '../ParseResult.jsx'; +import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; + +import { isNumber } from '../../functions.jsx' + +export default class DetectFootnoteOld extends ToTextItemBlockTransformation { + + constructor() { + super("Detect Footnote "); + } + + transform(parseResult:ParseResult) { + + var nextFooterNumber = 1; + var potentialFootnoteItem; + var foundFootnotes = 0; + + const newContent = parseResult.content.map(page => { + const newTextItems = []; + for (var i = 0; i < page.textItems.length; i++) { + const item = page.textItems[i]; + if (potentialFootnoteItem) { + if (potentialFootnoteItem.y - item.y < item.height) { + potentialFootnoteItem.annotation = REMOVED_ANNOTATION; + item.annotation = REMOVED_ANNOTATION; + newTextItems.push(potentialFootnoteItem); + newTextItems.push(item); + newTextItems.push(new TextItem({ + x: potentialFootnoteItem.x, + y: item.y, + width: potentialFootnoteItem.width + item.width, + height: item.height, + text: '[' + potentialFootnoteItem.text + '] ' + item.text, + annotation: ADDED_ANNOTATION + })); + //TODO repsect multiline!! + nextFooterNumber++; + foundFootnotes++; + } + potentialFootnoteItem = null; + } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) { + potentialFootnoteItem = item; + } else { + newTextItems.push(item); + } + } + return { + ...page, + textItems: newTextItems + }; + }); + + return new ParseResult({ + ...parseResult, + content: newContent, + messages: ['Detected ' + foundFootnotes + ' footnotes'] + }); + } + + completeTransform(parseResult:ParseResult) { + parseResult.content.forEach(page => { + page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); + page.textItems.forEach(textItem => textItem.annotation = null) + }); + return parseResult; + } + +} \ No newline at end of file diff --git a/src/javascript/models/transformations/DetectFormats.jsx b/src/javascript/models/transformations/old/DetectFormats.jsx similarity index 100% rename from src/javascript/models/transformations/DetectFormats.jsx rename to src/javascript/models/transformations/old/DetectFormats.jsx diff --git a/src/javascript/models/transformations/DetectLinks.jsx b/src/javascript/models/transformations/old/DetectLinks.jsx similarity index 100% rename from src/javascript/models/transformations/DetectLinks.jsx rename to src/javascript/models/transformations/old/DetectLinks.jsx diff --git a/src/javascript/models/transformations/HeadlineDetector.jsx b/src/javascript/models/transformations/old/HeadlineDetector.jsx similarity index 100% rename from src/javascript/models/transformations/HeadlineDetector.jsx rename to src/javascript/models/transformations/old/HeadlineDetector.jsx diff --git a/src/javascript/models/transformations/old/HeadlineDetector2.jsx b/src/javascript/models/transformations/old/HeadlineDetector2.jsx new file mode 100644 index 0000000..8306289 --- /dev/null +++ b/src/javascript/models/transformations/old/HeadlineDetector2.jsx @@ -0,0 +1,107 @@ +import Transformation from './Transformation.jsx'; +import TextItem from '../TextItem.jsx'; +import PdfPage from '../PdfPage.jsx'; +import ContentView from '../ContentView.jsx'; +import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; + +import Headline from '../markdown/Headline.jsx'; + +function getMostUsedHeight(heightToOccurrence) { + var maxOccurence = 0; + var maxHeight = 0; + Object.keys(heightToOccurrence).map((element) => { + if (heightToOccurrence[element] > maxOccurence) { + maxOccurence = heightToOccurrence[element]; + maxHeight = element; + } + }); + return parseInt(maxHeight); +} + + +export default class HeadlineDetector extends Transformation { + + constructor() { + super("Detect Headlines"); + } + + contentView() { + return ContentView.PDF; + } + + // Strategy: + // - find most used height => this & every height below is paragraph + // - heights which start a page are likely to be headlines + // - maxHeigth is likely a headline + // - heights which occur on more then one page are likely to be headlines + transform(pages:PdfPage[]) { + + const heightToOccurrence = {}; + pages.forEach(page => { + page.textItems.forEach(item => { + heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1; + }); + }); + console.debug(heightToOccurrence); + const mostUsedHeight = getMostUsedHeight(heightToOccurrence); + console.debug("mostUsedHeight: " + mostUsedHeight); + + const headlineHeights = new Set(Object.keys(heightToOccurrence).filter(height => parseInt(height) > mostUsedHeight).map(elem => parseInt(elem))); + console.debug(Array.from(headlineHeights)); + const headlineHeights2 = new Set(); + pages.forEach(page => { + const textItems = page.textItems; + for (var i = 0; i < textItems.length; i++) { + const item = textItems[i]; + if (item.height > mostUsedHeight) { + + item.annotation = ADDED_ANNOTATION; + const firstItemOnPage = i == 0; + var upperDistance = 99; + if (!firstItemOnPage) { + upperDistance = textItems[i - 1].y - item.y - item.height; + } + var lowerDistance = 0; + const lastItemOnPage = i == textItems.length - 1; + if (!lastItemOnPage) { + lowerDistance = item.y - textItems[i + 1].y - textItems[i + 1].height; + } + if (firstItemOnPage) { + console.debug("add " + item.height); + console.debug("potential headline: " + item.height + " | " + item.text); + console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage); + console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance); + headlineHeights2.add(item.height); + } + + // if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) { + // console.debug("remove " + item.height); + // console.debug("potential headline: " + item.height + " | " + item.text); + // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage); + // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance); + // headlineHeights.delete(item.height); + // } + + + // if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) { + // item.annotation = ADDED_ANNOTATION; + // } + // console.debug("potential headline: " + item.height + " | " + item.text); + // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage); + // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance); + } + } + }); + console.debug(Array.from(headlineHeights2)); + + return pages; + } + + processAnnotations(pages:PdfPage[]) { + pages.forEach(page => { + page.textItems.forEach(textItem => textItem.annotation = null) + }); + return pages; + } + +} \ No newline at end of file diff --git a/src/javascript/models/transformations/HeadlineToUppercase.jsx b/src/javascript/models/transformations/old/HeadlineToUppercase.jsx similarity index 100% rename from src/javascript/models/transformations/HeadlineToUppercase.jsx rename to src/javascript/models/transformations/old/HeadlineToUppercase.jsx diff --git a/src/javascript/models/transformations/RemoveWhitespaces.jsx b/src/javascript/models/transformations/old/RemoveWhitespaces.jsx similarity index 100% rename from src/javascript/models/transformations/RemoveWhitespaces.jsx rename to src/javascript/models/transformations/old/RemoveWhitespaces.jsx diff --git a/src/javascript/models/transformations/ToBlockSystem.jsx b/src/javascript/models/transformations/old/ToBlockSystem.jsx similarity index 100% rename from src/javascript/models/transformations/ToBlockSystem.jsx rename to src/javascript/models/transformations/old/ToBlockSystem.jsx