diff --git a/src/javascript/models/transformations/old/CombineSameY.jsx b/src/javascript/models/transformations/old/CombineSameY.jsx
deleted file mode 100644
index eb3e2b8..0000000
--- a/src/javascript/models/transformations/old/CombineSameY.jsx
+++ /dev/null
@@ -1,101 +0,0 @@
-import ToTextItemTransformation from './ToTextItemTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-function combineTextItems(textItems:TextItem[]) {
- var numChars = 0;
- var sumWidth = 0;
- var maxHeight = 0;
- textItems.forEach(textItem => {
- if (textItem.width > 0) {
- numChars += textItem.text.length;
- sumWidth += textItem.width;
- }
- maxHeight = Math.max(textItem.height, maxHeight);
- });
- const avgCharacterWidth = Math.round(sumWidth / numChars);
-
- var combinedText = '';
- var sumWidthWithWhitespaces = sumWidth;
- var lastItemX;
- var lastItemWidth;
- textItems.forEach(textItem => {
- if (lastItemX && textItem.x - lastItemX - lastItemWidth > avgCharacterWidth) {
- combinedText += ' ';
- sumWidthWithWhitespaces += avgCharacterWidth;
- }
- combinedText += textItem.text;
- lastItemX = textItem.x;
- lastItemWidth = textItem.width > 0 ? textItem.width : avgCharacterWidth / 2 * textItem.text.length;
- });
-
- return new TextItem({
- x: textItems[0].x,
- y: textItems[0].y,
- width: sumWidthWithWhitespaces,
- height: maxHeight,
- text: combinedText,
- annotation: ADDED_ANNOTATION
- });
-}
-
-export default class CombineSameY extends ToTextItemTransformation {
-
- constructor() {
- super("Combine Text On Same Y");
- }
-
- transform(parseResult:ParseResult) {
- const newContent = parseResult.content.map(pdfPage => {
- const newTextItems = [];
- var textItemsWithSameY = [];
-
- var completeTextItemsWithSameY = function(textItemsWithSameY) {
- if (textItemsWithSameY.length == 1) {
- newTextItems.push(textItemsWithSameY[0]);
- } else {
- // add removed text-items
- textItemsWithSameY.forEach(textItem => {
- textItem.annotation = REMOVED_ANNOTATION;
- newTextItems.push(textItem);
- });
- newTextItems.push(combineTextItems(textItemsWithSameY));
- }
- }
-
- pdfPage.textItems.forEach(textItem => {
- if (textItemsWithSameY.length == 0 || Math.abs(textItem.y - textItemsWithSameY[textItemsWithSameY.length - 1].y) < 2) {
- //fill array
- textItemsWithSameY.push(textItem);
- } else {
- //rotate
- completeTextItemsWithSameY(textItemsWithSameY);
- textItemsWithSameY = [textItem];
- }
- });
- if (textItemsWithSameY.length > 0) {
- completeTextItemsWithSameY(textItemsWithSameY);
- }
-
- return {
- ...pdfPage,
- textItems: newTextItems
- };
- });
-
- return new ParseResult({
- ...parseResult,
- content: newContent
- });
- }
-
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/old/DetectFootnoteOld.jsx b/src/javascript/models/transformations/old/DetectFootnoteOld.jsx
deleted file mode 100644
index c5ff231..0000000
--- a/src/javascript/models/transformations/old/DetectFootnoteOld.jsx
+++ /dev/null
@@ -1,70 +0,0 @@
-import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-import { isNumber } from '../../functions.jsx'
-
-export default class DetectFootnoteOld extends ToTextItemBlockTransformation {
-
- constructor() {
- super("Detect Footnote ");
- }
-
- transform(parseResult:ParseResult) {
-
- var nextFooterNumber = 1;
- var potentialFootnoteItem;
- var foundFootnotes = 0;
-
- const newContent = parseResult.content.map(page => {
- const newTextItems = [];
- for (var i = 0; i < page.textItems.length; i++) {
- const item = page.textItems[i];
- if (potentialFootnoteItem) {
- if (potentialFootnoteItem.y - item.y < item.height) {
- potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
- item.annotation = REMOVED_ANNOTATION;
- newTextItems.push(potentialFootnoteItem);
- newTextItems.push(item);
- newTextItems.push(new TextItem({
- x: potentialFootnoteItem.x,
- y: item.y,
- width: potentialFootnoteItem.width + item.width,
- height: item.height,
- text: '[' + potentialFootnoteItem.text + '] ' + item.text,
- annotation: ADDED_ANNOTATION
- }));
- //TODO repsect multiline!!
- nextFooterNumber++;
- foundFootnotes++;
- }
- potentialFootnoteItem = null;
- } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
- potentialFootnoteItem = item;
- } else {
- newTextItems.push(item);
- }
- }
- return {
- ...page,
- textItems: newTextItems
- };
- });
-
- return new ParseResult({
- ...parseResult,
- content: newContent,
- messages: ['Detected ' + foundFootnotes + ' footnotes']
- });
- }
-
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/old/DetectFormats.jsx b/src/javascript/models/transformations/old/DetectFormats.jsx
deleted file mode 100644
index 184f1b1..0000000
--- a/src/javascript/models/transformations/old/DetectFormats.jsx
+++ /dev/null
@@ -1,177 +0,0 @@
-import React from 'react';
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import ParseResult from '../ParseResult.jsx';
-import { REMOVED_ANNOTATION } from '../Annotation.jsx';
-import Annotation from '../Annotation.jsx';
-
-//Detect word/sentence formats like bold, italic,...
-export default class DetectFormats extends ToPdfViewTransformation {
-
- constructor() {
- super("Detect Bold/Italic");
- }
-
- createSummaryView(parseResult:ParseResult) {
- return
- Detected
- { ' ' + parseResult.summary.foundFormats + ' ' } formats.
-
;
- }
-
-
- transform(parseResult:ParseResult) {
- var foundFormats = 0;
- const {mostUsedHeight, mostUsedFont, maxHeightFont} = parseResult.globals;
- const symbols = {
- bold: '**',
- emphasis: '_'
- }
-
- const newContent = parseResult.content.map(page => {
- const newTextItems = [];
-
- //bundle items on same Y
- const groupedItems = groupByFollowingY(page.textItems);
- var lastItem;
- var lastFormat;
-
- const addNextItem = (item, format) => {
- if (lastItem) {
- if (lastFormat !== format) {
- lastItem.text = appendSymbol(lastItem.text, symbols[lastFormat]);
- if (lastItem.annotation) {
- lastItem.annotation = newAnnotation(lastFormat);
- } else {
- lastItem.annotation = newAnnotation('End ' + lastFormat);
- }
- }
- lastItem.height = mostUsedHeight;
- newTextItems.push(lastItem);
- }
-
- if (format) {
- if (lastFormat !== format) {
- item.text = prependSymbol(item.text, symbols[format]);
- item.annotation = newAnnotation('Start ' + format);
- }
- lastItem = item;
- lastFormat = format;
- } else {
- newTextItems.push(item);
- lastItem = null;
- lastFormat = null;
- }
- };
-
-
- groupedItems.forEach(itemGroup => {
-
- //probably headline
- const differentHeightsButSameFont = itemsHaveDifferentHeightsButSameFont(itemGroup);
-
- itemGroup.forEach(item => {
- const paragraphHeighOrSlightlyBigger = item.height == mostUsedHeight || item.height == mostUsedHeight + 1;
- if (!differentHeightsButSameFont && paragraphHeighOrSlightlyBigger && item.font !== mostUsedFont) {
- // item.annotation = REMOVED_ANNOTATION;
-
- const format = item.font === maxHeightFont ? 'bold' : 'emphasis';
- addNextItem(item, format);
-
- //TODO test with womb compilation. _Th_, _ff_,... check font like SanSarif ?
- //TODO don't touch 'eingerückte' Zeichen => detect early ?
- //TODO (Maybe) could detect combined bold & emphasis like font=bold.font + emph.font !?
- foundFormats++;
- } else {
- addNextItem(item);
- }
- });
- });
-
- return {
- ...page,
- textItems: newTextItems
- };
- });
- return new ParseResult({
- ...parseResult,
- content: newContent,
- summary: {
- foundFormats: foundFormats
- }
- });
- }
-
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
-
-}
-
-function newAnnotation(name) {
- return new Annotation({
- category: name,
- color: 'green'
- });
-}
-
-//groups all following text items with the same Y together
-function groupByFollowingY(textItems) {
- const yArrays = [];
- var itemsWithSameY = [];
- var lastItem;
- textItems.forEach(item => {
- if (itemsWithSameY.length == 0 || item.y == lastItem.y) {
- itemsWithSameY.push(item);
- } else {
- yArrays.push(itemsWithSameY);
- itemsWithSameY = [item];
- }
- lastItem = item;
- })
- yArrays.push(itemsWithSameY);
- return yArrays;
-}
-
-function itemsHaveDifferentHeightsButSameFont(itemGroup) {
- var heights = new Set();
- var fonts = new Set();
- itemGroup.forEach(item => {
- heights.add(item.height);
- fonts.add(item.font);
- });
- return heights.size > 1 && fonts.size == 1;
-}
-
-//TODO move to stringFunctions
-
-function prependSymbol(text, symbol) {
- if (text.charAt(0) == ' ') {
- return ' ' + symbol + removeLeadingWhitespace(text);
- }
- return symbol + text;
-}
-
-function appendSymbol(text, symbol) {
- if (text.charAt(text.length - 1) == ' ') {
- return removeTrailingWhitespace(text) + symbol + ' ';
- }
- return text + symbol;
-}
-
-function removeLeadingWhitespace(text) {
- while (text.charAt(0) == ' ') {
- text = text.substring(1, text.length);
- }
- return text;
-}
-
-function removeTrailingWhitespace(text) {
- while (text.charAt(text.length - 1) == ' ') {
- text = text.substring(0, text.length - 1);
- }
- return text;
-}
diff --git a/src/javascript/models/transformations/old/DetectHeadlines.jsx b/src/javascript/models/transformations/old/DetectHeadlines.jsx
deleted file mode 100644
index 8dd6d5e..0000000
--- a/src/javascript/models/transformations/old/DetectHeadlines.jsx
+++ /dev/null
@@ -1,182 +0,0 @@
-import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
-import ParseResult from '../ParseResult.jsx';
-import TextItemBlock from '../TextItemBlock.jsx';
-import { ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
-import ElementType from '../ElementType.jsx';
-import { headlineByLevel } from '../ElementType.jsx';
-
-//Detect headlines
-export default class DetectHeadlines extends ToTextItemBlockTransformation {
-
- constructor() {
- super("Detect Headlines");
- }
-
- transform(parseResult:ParseResult) {
- var foundHeadlines = 0;
- const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
-
- //Set max headlines (all headers on the same page are max level 2)
- const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight);
-
-
- var headlineHeightFlowBeforeToc = [];
- var headlineHeightsOccurenceBeforeToc = {};
- var firstPageAfterToc = 0;
- if (tocPages && tocPages.length > 0) {
- [headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], mostUsedHeight, maxHeaderPages);
- firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
- }
-
- const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, mostUsedHeight, maxHeaderPages);
-
-
- // TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
- // TODO ==> parse seperately between beforeToc and after
- // TODO ==> Kala chakra, all uppercase
- // TODO ==> TOC headlines
-
- //var topHeadlinePassed = false;
- const headlineHeightMap = {};
- const headlineSizePerLevel = {};
- var currentHeadlineLevel;
- parseResult.pages.forEach(page => {
- const newBlocks = [];
- page.items.forEach(block => {
- newBlocks.push(block);
- if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
- // const combineResult = textCombiner.combine(block.textItems);
- // if (combineResult.textItems.length == 1) {
- // const height = combineResult.textItems[0].height;
- // if (height == maxHeight) {
- // // block.annotation = REMOVED_ANNOTATION;
- // currentHeadlineLevel = 1;
- // headlineSizePerLevel[currentHeadlineLevel] = height
- // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
- // }
- // else if (currentHeadlineLevel) {
- // const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
- // if (height < currentLevelSize) {
- // const nextLevelSize = headlineSizePerLevel[currentHeadlineLevel + 1];
- // // if(!nextLevelSize)
- // if (currentHeadlineLevel < 6) {
- // currentHeadlineLevel++;
- // }
- // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
- // headlineSizePerLevel[currentHeadlineLevel] = height;
- // } else if (height > currentLevelSize) {
- // const preLevelSize = headlineSizePerLevel[currentHeadlineLevel - 1];
- // if (currentHeadlineLevel > 1) {
- // currentHeadlineLevel--;
- // }
- // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
- // headlineSizePerLevel[currentHeadlineLevel] = height;
- // } else {
- // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
- // }
- // }
- // }
- }
- });
- page.items = newBlocks;
- });
-
- const heightToOccurrence = {};
- const fontToOccurrence = {};
- // parseResult.content.forEach(page => {
- // const newBlocks = [];
- // page.blocks.forEach(block => {
- // newBlocks.push(block);
- // if (!block.type && block.textItems[0].height > mostUsedHeight) {
- // foundHeadlines++;
- // block.annotation = REMOVED_ANNOTATION;
- // const combineResult = textCombiner.combine(block.textItems);
- // const height = combineResult.textItems[0].height;
- // const font = combineResult.textItems[0].font;
- // heightToOccurrence[height] = heightToOccurrence[height] ? heightToOccurrence[height] + 1 : 1;
- // fontToOccurrence[font] = fontToOccurrence[font] ? fontToOccurrence[font] + 1 : 1;
- // newBlocks.push(new PdfBlock({
- // textItems: combineResult.textItems,
- // type: HEADLINE1,
- // annotation: ADDED_ANNOTATION,
- // parsedElements: combineResult.parsedElements
- // }));
- // }
- // });
- // page.blocks = newBlocks;
- // });
-
- return new ParseResult({
- ...parseResult,
- messages: [
- 'Found headlines: ' + foundHeadlines,
- 'Height repetition: ' + JSON.stringify(heightToOccurrence),
- 'Font repetition: ' + JSON.stringify(fontToOccurrence),
- 'Pages with max Header: ' + maxHeaderPages,
- 'Headline Height Flow (before TOC): ' + headlineHeightFlowBeforeToc,
- 'Headline Heights Occurence (before TOC): ' + JSON.stringify(headlineHeightsOccurenceBeforeToc),
- 'Headline Height Flow: ' + headlineHeightFlowAfterToc,
- 'Headline Heights Occurence: ' + JSON.stringify(headlineHeightsOccurenceAfterToc),
- ]
- });
- }
-
-}
-
-function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
- // Find pages with max height
- const maxHeaderPagesSet = new Set();
- pages.forEach(page => {
- page.items.forEach(block => {
- if (!block.type && block.textItems[0].height == maxHeight) {
- maxHeaderPagesSet.add(page);
- }
- });
- });
-
- // Now convert those pages to headlines
- const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
- maxHeaderPagesSet.forEach(pageWithMaxHeader => {
- pageWithMaxHeader.items.forEach(block => {
- if (block.textItems.length == 1) {
- const height = block.textItems[0].height;
- if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
- block.annotation = DETECTED_ANNOTATION;
- if (height == maxHeight) {
- block.type = ElementType.H1;
- } else {
- block.type = ElementType.H2;
- }
- }
- }
- });
- });
- return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
-}
-
-function calculateHeadlineHeigthFlow(pages, from, to, mostUsedHeight, maxHeaderPages) {
- const headlineHeightFlow = [];
- const headlineHeightsOccurences = {};
- var lastHeadlineHeight;
- for (var i = from; i < to; i++) {
- const page = pages[i];
- if (!maxHeaderPages.includes(page.index + 1)) {
- page.items.forEach(block => {
- if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
- if (block.textItems.length == 1) {
- const height = block.textItems[0].height;
- headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ;
- if (!lastHeadlineHeight || height != lastHeadlineHeight) {
- headlineHeightFlow.push(height);
- //headlineFontFlow.push(combineResult.textItems[0].font)
- lastHeadlineHeight = height;
- }
- }
- }
- });
- }
- }
-
- return [headlineHeightFlow, headlineHeightsOccurences];
-}
-
diff --git a/src/javascript/models/transformations/old/HeadlineDetector.jsx b/src/javascript/models/transformations/old/HeadlineDetector.jsx
deleted file mode 100644
index 782d9d2..0000000
--- a/src/javascript/models/transformations/old/HeadlineDetector.jsx
+++ /dev/null
@@ -1,158 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-import Annotation from '../Annotation.jsx';
-
-import Headline from '../markdown/Headline.jsx';
-
-
-function analyzeHeigths(pages) {
- const analyzationResult = {
- maxHeight: 0,
- maxYPerPage: {},
- heights: [],
- mostUsedHeight: -1
- };
- const allHeights = new Set();
- pages.forEach(page => {
- var maxPageY = 0;
- page.textItems.forEach(item => {
- const height = item.height;
- allHeights.add(height);
- if (analyzationResult[height]) {
- analyzationResult[height].repetition = analyzationResult[height].repetition + 1;
- analyzationResult[height].pages.add(page.index);
- } else {
- analyzationResult[height] = {
- repetition: 1,
- pages: new Set([page.index])
- };
- }
- maxPageY = Math.max(maxPageY, item.y);
- analyzationResult.maxHeight = Math.max(analyzationResult.maxHeight, item.height);
- });
- analyzationResult.maxYPerPage[page.index] = maxPageY;
- });
-
- var maxRepetition = 0;
- allHeights.forEach(height => {
- const heightRepetition = analyzationResult[height].repetition;
- analyzationResult.heights.push(height);
- if (heightRepetition > maxRepetition) {
- maxRepetition = heightRepetition;
- analyzationResult.mostUsedHeight = height;
- }
- });
- analyzationResult.heights = analyzationResult.heights.sort((a, b) => a - b);
-
- return analyzationResult;
-}
-
-function findNextMajorHeight(heights, currentHeight, headlineLevels) {
- for (var i = currentHeight; i < heights.length; i++) {
- if (headlineLevels[heights[i]]) {
- return heights[i];
- }
- }
- throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineLevels=${headlineLevels}`;
-}
-
-
-export default class HeadlineDetector extends ToPdfViewTransformation {
-
- constructor() {
- super("Detect Headlines");
- }
-
- // Strategy:
- // - find most used height => this & every height below is paragraph
- // - heights which start a page are likely to be headlines
- // - maxHeigth is likely a headline
- // - heights which occur on more then one page are likely to be headlines
- transform(parseResult:ParseResult) {
- const heightAnalyzation = analyzeHeigths(parseResult.content);
-
- var paragraphHeight = heightAnalyzation.mostUsedHeight + 1;
-
- // text with more hight then the paragraph height which are on the top of the page are likely to be headlines
- const likelyHeadingHeights = new Set();
- parseResult.content.forEach(page => {
- page.textItems.forEach(item => {
- if (item.height > paragraphHeight && heightAnalyzation.maxYPerPage[page.index] == item.y) {
- likelyHeadingHeights.add(item.height);
- }
- });
- });
-
- const headlineHeights = [];
- heightAnalyzation.heights.forEach(height => {
- if (height == heightAnalyzation.maxHeight || (height > paragraphHeight && likelyHeadingHeights.has(height) && heightAnalyzation[height].pages.size > 1)) {
- headlineHeights.push(height);
- }
- });
-
-
- const headlineLevels = {};
- headlineHeights.reverse().forEach((height, i) => headlineLevels[height] = i + 1);
- var lastMajorHeight = paragraphHeight;
- var heights = heightAnalyzation.heights;
- for (var i = 0; i < heights.length; i++) {
- if (heights[i] > paragraphHeight && !headlineLevels[heights[i]]) {
- const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineLevels);
- const distanceToLower = heights[i] - lastMajorHeight;
- const distanceToHigher = nextMajorHeight - heights[i];
- if (distanceToLower <= distanceToHigher) {
- if (lastMajorHeight == paragraphHeight) {
- paragraphHeight++;
- } else {
- headlineLevels[heights[i]] = headlineLevels[lastMajorHeight];
- }
- } else {
- headlineLevels[heights[i]] = headlineLevels[nextMajorHeight];
- }
- }
- if (headlineLevels[heights[i]]) {
- lastMajorHeight = heights[i];
- }
- }
-
- const newContent = parseResult.content.map(page => {
- const newTextItems = [];
- page.textItems.forEach(item => {
- if (item.height <= paragraphHeight) {
- newTextItems.push(item);
- } else {
- const headlineLevel = headlineLevels[item.height];
- newTextItems.push(new TextItem({
- ...item,
- text: item.text,
- annotation: new Annotation({
- category: "Headline-" + headlineLevel,
- color: 'green'
- }),
- markdownElement: new Headline({
- level: headlineLevel
- })
- }));
- }
- });
- return {
- ...page,
- textItems: newTextItems
- };
- });
-
- return new ParseResult({
- ...parseResult,
- content: newContent,
- });
- }
-
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/old/HeadlineDetector2.jsx b/src/javascript/models/transformations/old/HeadlineDetector2.jsx
deleted file mode 100644
index 8306289..0000000
--- a/src/javascript/models/transformations/old/HeadlineDetector2.jsx
+++ /dev/null
@@ -1,107 +0,0 @@
-import Transformation from './Transformation.jsx';
-import TextItem from '../TextItem.jsx';
-import PdfPage from '../PdfPage.jsx';
-import ContentView from '../ContentView.jsx';
-import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-import Headline from '../markdown/Headline.jsx';
-
-function getMostUsedHeight(heightToOccurrence) {
- var maxOccurence = 0;
- var maxHeight = 0;
- Object.keys(heightToOccurrence).map((element) => {
- if (heightToOccurrence[element] > maxOccurence) {
- maxOccurence = heightToOccurrence[element];
- maxHeight = element;
- }
- });
- return parseInt(maxHeight);
-}
-
-
-export default class HeadlineDetector extends Transformation {
-
- constructor() {
- super("Detect Headlines");
- }
-
- contentView() {
- return ContentView.PDF;
- }
-
- // Strategy:
- // - find most used height => this & every height below is paragraph
- // - heights which start a page are likely to be headlines
- // - maxHeigth is likely a headline
- // - heights which occur on more then one page are likely to be headlines
- transform(pages:PdfPage[]) {
-
- const heightToOccurrence = {};
- pages.forEach(page => {
- page.textItems.forEach(item => {
- heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
- });
- });
- console.debug(heightToOccurrence);
- const mostUsedHeight = getMostUsedHeight(heightToOccurrence);
- console.debug("mostUsedHeight: " + mostUsedHeight);
-
- const headlineHeights = new Set(Object.keys(heightToOccurrence).filter(height => parseInt(height) > mostUsedHeight).map(elem => parseInt(elem)));
- console.debug(Array.from(headlineHeights));
- const headlineHeights2 = new Set();
- pages.forEach(page => {
- const textItems = page.textItems;
- for (var i = 0; i < textItems.length; i++) {
- const item = textItems[i];
- if (item.height > mostUsedHeight) {
-
- item.annotation = ADDED_ANNOTATION;
- const firstItemOnPage = i == 0;
- var upperDistance = 99;
- if (!firstItemOnPage) {
- upperDistance = textItems[i - 1].y - item.y - item.height;
- }
- var lowerDistance = 0;
- const lastItemOnPage = i == textItems.length - 1;
- if (!lastItemOnPage) {
- lowerDistance = item.y - textItems[i + 1].y - textItems[i + 1].height;
- }
- if (firstItemOnPage) {
- console.debug("add " + item.height);
- console.debug("potential headline: " + item.height + " | " + item.text);
- console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
- console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
- headlineHeights2.add(item.height);
- }
-
- // if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) {
- // console.debug("remove " + item.height);
- // console.debug("potential headline: " + item.height + " | " + item.text);
- // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
- // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
- // headlineHeights.delete(item.height);
- // }
-
-
- // if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) {
- // item.annotation = ADDED_ANNOTATION;
- // }
- // console.debug("potential headline: " + item.height + " | " + item.text);
- // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
- // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
- }
- }
- });
- console.debug(Array.from(headlineHeights2));
-
- return pages;
- }
-
- processAnnotations(pages:PdfPage[]) {
- pages.forEach(page => {
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return pages;
- }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/old/HeadlineToUppercase.jsx b/src/javascript/models/transformations/old/HeadlineToUppercase.jsx
deleted file mode 100644
index 923348f..0000000
--- a/src/javascript/models/transformations/old/HeadlineToUppercase.jsx
+++ /dev/null
@@ -1,58 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
-
-import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
-
-// Uppercase headlines are often parsed with very mixed character with pdf.js, like 'A heAdLine'.
-// This tries to detect them and make them all uppercase.
-export default class HeadlineToUppercase extends ToPdfViewTransformation {
-
- constructor() {
- super("Headlines Uppercase");
- }
-
- transform(parseResult:ParseResult) {
- const newContent = parseResult.content.map(page => {
- const newTextItems = [];
- page.textItems.forEach(item => {
- if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') {
- const headline = item.text.trim();
- if (hasUpperCaseCharacterInMiddleOfWord(headline)) {
- item.annotation = REMOVED_ANNOTATION;
- newTextItems.push(item);
- newTextItems.push(new TextItem({
- ...item,
- text: item.text.toUpperCase(),
- annotation: ADDED_ANNOTATION
- }));
- } else {
- item.annotation = UNCHANGED_ANNOTATION;
- newTextItems.push(item);
- }
- } else {
- newTextItems.push(item);
- }
- });
- return {
- ...page,
- textItems: newTextItems
- };
- });
-
- return new ParseResult({
- ...parseResult,
- content: newContent,
- });
- }
-
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/old/ToBlockSystem.jsx b/src/javascript/models/transformations/old/ToBlockSystem.jsx
deleted file mode 100644
index 0667d35..0000000
--- a/src/javascript/models/transformations/old/ToBlockSystem.jsx
+++ /dev/null
@@ -1,74 +0,0 @@
-import React from 'react';
-import Transformation from './Transformation.jsx';
-import BlockPageView from '../../components/debug/BlockPageView.jsx';
-import ParseResult from '../ParseResult.jsx';
-import BlockPage from '../BlockPage.jsx';
-
-export default class ToBlockSystem extends Transformation {
-
- constructor() {
- super("To Block System");
- }
-
- createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
- return ;
- }
-
- transform(parseResult:ParseResult) {
- const blocks = [];
- parseResult.content.forEach(page => {
- var minDiff = 99;
- var lastY = 0;
- page.textItems.forEach(item => {
- if (lastY > 0) {
- const yDiff = lastY - item.y - item.height;
- if (yDiff > 0) {
- minDiff = Math.min(minDiff, yDiff);
- }
- }
- lastY = item.y;
- });
-
- var text;
- const rollup = (category) => {
- if (text && text.length > 0) {
- // console.debug("Push[" + blocks.length + "]: " + text);
- blocks.push({
- category: category,
- text: text
- });
- }
- text = null;
- };
-
- lastY = 0;
- page.textItems.forEach(item => {
- if (item.markdownElement) {
- rollup("Block");
- text = item.markdownElement.transformText(item.text);
- rollup(item.markdownElement.constructor.name);
- } else if (!text) {
- text = item.text;
- } else {
- const yDiff = lastY - item.y - item.height;
- if (yDiff > minDiff + 2) {
- rollup("Block");
- text = item.text;
- } else {
- text += '\n' + item.text;
- }
- }
- lastY = item.y;
- });
- rollup("Block")
- });
- return new ParseResult({
- ...parseResult,
- content: [new BlockPage({
- index: 0,
- blocks: blocks
- })],
- });
- }
-
-}
\ No newline at end of file