diff --git a/src/javascript/models/transformations/old/CombineSameY.jsx b/src/javascript/models/transformations/old/CombineSameY.jsx deleted file mode 100644 index eb3e2b8..0000000 --- a/src/javascript/models/transformations/old/CombineSameY.jsx +++ /dev/null @@ -1,101 +0,0 @@ -import ToTextItemTransformation from './ToTextItemTransformation.jsx'; -import TextItem from '../TextItem.jsx'; -import ParseResult from '../ParseResult.jsx'; -import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; - -function combineTextItems(textItems:TextItem[]) { - var numChars = 0; - var sumWidth = 0; - var maxHeight = 0; - textItems.forEach(textItem => { - if (textItem.width > 0) { - numChars += textItem.text.length; - sumWidth += textItem.width; - } - maxHeight = Math.max(textItem.height, maxHeight); - }); - const avgCharacterWidth = Math.round(sumWidth / numChars); - - var combinedText = ''; - var sumWidthWithWhitespaces = sumWidth; - var lastItemX; - var lastItemWidth; - textItems.forEach(textItem => { - if (lastItemX && textItem.x - lastItemX - lastItemWidth > avgCharacterWidth) { - combinedText += ' '; - sumWidthWithWhitespaces += avgCharacterWidth; - } - combinedText += textItem.text; - lastItemX = textItem.x; - lastItemWidth = textItem.width > 0 ? textItem.width : avgCharacterWidth / 2 * textItem.text.length; - }); - - return new TextItem({ - x: textItems[0].x, - y: textItems[0].y, - width: sumWidthWithWhitespaces, - height: maxHeight, - text: combinedText, - annotation: ADDED_ANNOTATION - }); -} - -export default class CombineSameY extends ToTextItemTransformation { - - constructor() { - super("Combine Text On Same Y"); - } - - transform(parseResult:ParseResult) { - const newContent = parseResult.content.map(pdfPage => { - const newTextItems = []; - var textItemsWithSameY = []; - - var completeTextItemsWithSameY = function(textItemsWithSameY) { - if (textItemsWithSameY.length == 1) { - newTextItems.push(textItemsWithSameY[0]); - } else { - // add removed text-items - textItemsWithSameY.forEach(textItem => { - textItem.annotation = REMOVED_ANNOTATION; - newTextItems.push(textItem); - }); - newTextItems.push(combineTextItems(textItemsWithSameY)); - } - } - - pdfPage.textItems.forEach(textItem => { - if (textItemsWithSameY.length == 0 || Math.abs(textItem.y - textItemsWithSameY[textItemsWithSameY.length - 1].y) < 2) { - //fill array - textItemsWithSameY.push(textItem); - } else { - //rotate - completeTextItemsWithSameY(textItemsWithSameY); - textItemsWithSameY = [textItem]; - } - }); - if (textItemsWithSameY.length > 0) { - completeTextItemsWithSameY(textItemsWithSameY); - } - - return { - ...pdfPage, - textItems: newTextItems - }; - }); - - return new ParseResult({ - ...parseResult, - content: newContent - }); - } - - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/old/DetectFootnoteOld.jsx b/src/javascript/models/transformations/old/DetectFootnoteOld.jsx deleted file mode 100644 index c5ff231..0000000 --- a/src/javascript/models/transformations/old/DetectFootnoteOld.jsx +++ /dev/null @@ -1,70 +0,0 @@ -import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; -import TextItem from '../TextItem.jsx'; -import ParseResult from '../ParseResult.jsx'; -import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; - -import { isNumber } from '../../functions.jsx' - -export default class DetectFootnoteOld extends ToTextItemBlockTransformation { - - constructor() { - super("Detect Footnote "); - } - - transform(parseResult:ParseResult) { - - var nextFooterNumber = 1; - var potentialFootnoteItem; - var foundFootnotes = 0; - - const newContent = parseResult.content.map(page => { - const newTextItems = []; - for (var i = 0; i < page.textItems.length; i++) { - const item = page.textItems[i]; - if (potentialFootnoteItem) { - if (potentialFootnoteItem.y - item.y < item.height) { - potentialFootnoteItem.annotation = REMOVED_ANNOTATION; - item.annotation = REMOVED_ANNOTATION; - newTextItems.push(potentialFootnoteItem); - newTextItems.push(item); - newTextItems.push(new TextItem({ - x: potentialFootnoteItem.x, - y: item.y, - width: potentialFootnoteItem.width + item.width, - height: item.height, - text: '[' + potentialFootnoteItem.text + '] ' + item.text, - annotation: ADDED_ANNOTATION - })); - //TODO repsect multiline!! - nextFooterNumber++; - foundFootnotes++; - } - potentialFootnoteItem = null; - } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) { - potentialFootnoteItem = item; - } else { - newTextItems.push(item); - } - } - return { - ...page, - textItems: newTextItems - }; - }); - - return new ParseResult({ - ...parseResult, - content: newContent, - messages: ['Detected ' + foundFootnotes + ' footnotes'] - }); - } - - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/old/DetectFormats.jsx b/src/javascript/models/transformations/old/DetectFormats.jsx deleted file mode 100644 index 184f1b1..0000000 --- a/src/javascript/models/transformations/old/DetectFormats.jsx +++ /dev/null @@ -1,177 +0,0 @@ -import React from 'react'; -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; -import ParseResult from '../ParseResult.jsx'; -import { REMOVED_ANNOTATION } from '../Annotation.jsx'; -import Annotation from '../Annotation.jsx'; - -//Detect word/sentence formats like bold, italic,... -export default class DetectFormats extends ToPdfViewTransformation { - - constructor() { - super("Detect Bold/Italic"); - } - - createSummaryView(parseResult:ParseResult) { - return
- Detected - { ' ' + parseResult.summary.foundFormats + ' ' } formats. -
; - } - - - transform(parseResult:ParseResult) { - var foundFormats = 0; - const {mostUsedHeight, mostUsedFont, maxHeightFont} = parseResult.globals; - const symbols = { - bold: '**', - emphasis: '_' - } - - const newContent = parseResult.content.map(page => { - const newTextItems = []; - - //bundle items on same Y - const groupedItems = groupByFollowingY(page.textItems); - var lastItem; - var lastFormat; - - const addNextItem = (item, format) => { - if (lastItem) { - if (lastFormat !== format) { - lastItem.text = appendSymbol(lastItem.text, symbols[lastFormat]); - if (lastItem.annotation) { - lastItem.annotation = newAnnotation(lastFormat); - } else { - lastItem.annotation = newAnnotation('End ' + lastFormat); - } - } - lastItem.height = mostUsedHeight; - newTextItems.push(lastItem); - } - - if (format) { - if (lastFormat !== format) { - item.text = prependSymbol(item.text, symbols[format]); - item.annotation = newAnnotation('Start ' + format); - } - lastItem = item; - lastFormat = format; - } else { - newTextItems.push(item); - lastItem = null; - lastFormat = null; - } - }; - - - groupedItems.forEach(itemGroup => { - - //probably headline - const differentHeightsButSameFont = itemsHaveDifferentHeightsButSameFont(itemGroup); - - itemGroup.forEach(item => { - const paragraphHeighOrSlightlyBigger = item.height == mostUsedHeight || item.height == mostUsedHeight + 1; - if (!differentHeightsButSameFont && paragraphHeighOrSlightlyBigger && item.font !== mostUsedFont) { - // item.annotation = REMOVED_ANNOTATION; - - const format = item.font === maxHeightFont ? 'bold' : 'emphasis'; - addNextItem(item, format); - - //TODO test with womb compilation. _Th_, _ff_,... check font like SanSarif ? - //TODO don't touch 'eingerückte' Zeichen => detect early ? - //TODO (Maybe) could detect combined bold & emphasis like font=bold.font + emph.font !? - foundFormats++; - } else { - addNextItem(item); - } - }); - }); - - return { - ...page, - textItems: newTextItems - }; - }); - return new ParseResult({ - ...parseResult, - content: newContent, - summary: { - foundFormats: foundFormats - } - }); - } - - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } - -} - -function newAnnotation(name) { - return new Annotation({ - category: name, - color: 'green' - }); -} - -//groups all following text items with the same Y together -function groupByFollowingY(textItems) { - const yArrays = []; - var itemsWithSameY = []; - var lastItem; - textItems.forEach(item => { - if (itemsWithSameY.length == 0 || item.y == lastItem.y) { - itemsWithSameY.push(item); - } else { - yArrays.push(itemsWithSameY); - itemsWithSameY = [item]; - } - lastItem = item; - }) - yArrays.push(itemsWithSameY); - return yArrays; -} - -function itemsHaveDifferentHeightsButSameFont(itemGroup) { - var heights = new Set(); - var fonts = new Set(); - itemGroup.forEach(item => { - heights.add(item.height); - fonts.add(item.font); - }); - return heights.size > 1 && fonts.size == 1; -} - -//TODO move to stringFunctions - -function prependSymbol(text, symbol) { - if (text.charAt(0) == ' ') { - return ' ' + symbol + removeLeadingWhitespace(text); - } - return symbol + text; -} - -function appendSymbol(text, symbol) { - if (text.charAt(text.length - 1) == ' ') { - return removeTrailingWhitespace(text) + symbol + ' '; - } - return text + symbol; -} - -function removeLeadingWhitespace(text) { - while (text.charAt(0) == ' ') { - text = text.substring(1, text.length); - } - return text; -} - -function removeTrailingWhitespace(text) { - while (text.charAt(text.length - 1) == ' ') { - text = text.substring(0, text.length - 1); - } - return text; -} diff --git a/src/javascript/models/transformations/old/DetectHeadlines.jsx b/src/javascript/models/transformations/old/DetectHeadlines.jsx deleted file mode 100644 index 8dd6d5e..0000000 --- a/src/javascript/models/transformations/old/DetectHeadlines.jsx +++ /dev/null @@ -1,182 +0,0 @@ -import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; -import ParseResult from '../ParseResult.jsx'; -import TextItemBlock from '../TextItemBlock.jsx'; -import { ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx'; -import ElementType from '../ElementType.jsx'; -import { headlineByLevel } from '../ElementType.jsx'; - -//Detect headlines -export default class DetectHeadlines extends ToTextItemBlockTransformation { - - constructor() { - super("Detect Headlines"); - } - - transform(parseResult:ParseResult) { - var foundHeadlines = 0; - const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals; - - //Set max headlines (all headers on the same page are max level 2) - const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight); - - - var headlineHeightFlowBeforeToc = []; - var headlineHeightsOccurenceBeforeToc = {}; - var firstPageAfterToc = 0; - if (tocPages && tocPages.length > 0) { - [headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], mostUsedHeight, maxHeaderPages); - firstPageAfterToc = tocPages[tocPages.length - 1] + 1; - } - - const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, mostUsedHeight, maxHeaderPages); - - - // TODO ==> do flow analysis (remove out of flow or snap, start with 2nd) - // TODO ==> parse seperately between beforeToc and after - // TODO ==> Kala chakra, all uppercase - // TODO ==> TOC headlines - - //var topHeadlinePassed = false; - const headlineHeightMap = {}; - const headlineSizePerLevel = {}; - var currentHeadlineLevel; - parseResult.pages.forEach(page => { - const newBlocks = []; - page.items.forEach(block => { - newBlocks.push(block); - if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { - // const combineResult = textCombiner.combine(block.textItems); - // if (combineResult.textItems.length == 1) { - // const height = combineResult.textItems[0].height; - // if (height == maxHeight) { - // // block.annotation = REMOVED_ANNOTATION; - // currentHeadlineLevel = 1; - // headlineSizePerLevel[currentHeadlineLevel] = height - // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); - // } - // else if (currentHeadlineLevel) { - // const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel]; - // if (height < currentLevelSize) { - // const nextLevelSize = headlineSizePerLevel[currentHeadlineLevel + 1]; - // // if(!nextLevelSize) - // if (currentHeadlineLevel < 6) { - // currentHeadlineLevel++; - // } - // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); - // headlineSizePerLevel[currentHeadlineLevel] = height; - // } else if (height > currentLevelSize) { - // const preLevelSize = headlineSizePerLevel[currentHeadlineLevel - 1]; - // if (currentHeadlineLevel > 1) { - // currentHeadlineLevel--; - // } - // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); - // headlineSizePerLevel[currentHeadlineLevel] = height; - // } else { - // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); - // } - // } - // } - } - }); - page.items = newBlocks; - }); - - const heightToOccurrence = {}; - const fontToOccurrence = {}; - // parseResult.content.forEach(page => { - // const newBlocks = []; - // page.blocks.forEach(block => { - // newBlocks.push(block); - // if (!block.type && block.textItems[0].height > mostUsedHeight) { - // foundHeadlines++; - // block.annotation = REMOVED_ANNOTATION; - // const combineResult = textCombiner.combine(block.textItems); - // const height = combineResult.textItems[0].height; - // const font = combineResult.textItems[0].font; - // heightToOccurrence[height] = heightToOccurrence[height] ? heightToOccurrence[height] + 1 : 1; - // fontToOccurrence[font] = fontToOccurrence[font] ? fontToOccurrence[font] + 1 : 1; - // newBlocks.push(new PdfBlock({ - // textItems: combineResult.textItems, - // type: HEADLINE1, - // annotation: ADDED_ANNOTATION, - // parsedElements: combineResult.parsedElements - // })); - // } - // }); - // page.blocks = newBlocks; - // }); - - return new ParseResult({ - ...parseResult, - messages: [ - 'Found headlines: ' + foundHeadlines, - 'Height repetition: ' + JSON.stringify(heightToOccurrence), - 'Font repetition: ' + JSON.stringify(fontToOccurrence), - 'Pages with max Header: ' + maxHeaderPages, - 'Headline Height Flow (before TOC): ' + headlineHeightFlowBeforeToc, - 'Headline Heights Occurence (before TOC): ' + JSON.stringify(headlineHeightsOccurenceBeforeToc), - 'Headline Height Flow: ' + headlineHeightFlowAfterToc, - 'Headline Heights Occurence: ' + JSON.stringify(headlineHeightsOccurenceAfterToc), - ] - }); - } - -} - -function convertMaxHeaders(pages, maxHeight, mostUsedHeight) { - // Find pages with max height - const maxHeaderPagesSet = new Set(); - pages.forEach(page => { - page.items.forEach(block => { - if (!block.type && block.textItems[0].height == maxHeight) { - maxHeaderPagesSet.add(page); - } - }); - }); - - // Now convert those pages to headlines - const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); - maxHeaderPagesSet.forEach(pageWithMaxHeader => { - pageWithMaxHeader.items.forEach(block => { - if (block.textItems.length == 1) { - const height = block.textItems[0].height; - if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) { - block.annotation = DETECTED_ANNOTATION; - if (height == maxHeight) { - block.type = ElementType.H1; - } else { - block.type = ElementType.H2; - } - } - } - }); - }); - return Array.from(maxHeaderPagesSet).map(page => page.index + 1); -} - -function calculateHeadlineHeigthFlow(pages, from, to, mostUsedHeight, maxHeaderPages) { - const headlineHeightFlow = []; - const headlineHeightsOccurences = {}; - var lastHeadlineHeight; - for (var i = from; i < to; i++) { - const page = pages[i]; - if (!maxHeaderPages.includes(page.index + 1)) { - page.items.forEach(block => { - if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { - if (block.textItems.length == 1) { - const height = block.textItems[0].height; - headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ; - if (!lastHeadlineHeight || height != lastHeadlineHeight) { - headlineHeightFlow.push(height); - //headlineFontFlow.push(combineResult.textItems[0].font) - lastHeadlineHeight = height; - } - } - } - }); - } - } - - return [headlineHeightFlow, headlineHeightsOccurences]; -} - diff --git a/src/javascript/models/transformations/old/HeadlineDetector.jsx b/src/javascript/models/transformations/old/HeadlineDetector.jsx deleted file mode 100644 index 782d9d2..0000000 --- a/src/javascript/models/transformations/old/HeadlineDetector.jsx +++ /dev/null @@ -1,158 +0,0 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; -import TextItem from '../TextItem.jsx'; -import ParseResult from '../ParseResult.jsx'; -import Annotation from '../Annotation.jsx'; - -import Headline from '../markdown/Headline.jsx'; - - -function analyzeHeigths(pages) { - const analyzationResult = { - maxHeight: 0, - maxYPerPage: {}, - heights: [], - mostUsedHeight: -1 - }; - const allHeights = new Set(); - pages.forEach(page => { - var maxPageY = 0; - page.textItems.forEach(item => { - const height = item.height; - allHeights.add(height); - if (analyzationResult[height]) { - analyzationResult[height].repetition = analyzationResult[height].repetition + 1; - analyzationResult[height].pages.add(page.index); - } else { - analyzationResult[height] = { - repetition: 1, - pages: new Set([page.index]) - }; - } - maxPageY = Math.max(maxPageY, item.y); - analyzationResult.maxHeight = Math.max(analyzationResult.maxHeight, item.height); - }); - analyzationResult.maxYPerPage[page.index] = maxPageY; - }); - - var maxRepetition = 0; - allHeights.forEach(height => { - const heightRepetition = analyzationResult[height].repetition; - analyzationResult.heights.push(height); - if (heightRepetition > maxRepetition) { - maxRepetition = heightRepetition; - analyzationResult.mostUsedHeight = height; - } - }); - analyzationResult.heights = analyzationResult.heights.sort((a, b) => a - b); - - return analyzationResult; -} - -function findNextMajorHeight(heights, currentHeight, headlineLevels) { - for (var i = currentHeight; i < heights.length; i++) { - if (headlineLevels[heights[i]]) { - return heights[i]; - } - } - throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineLevels=${headlineLevels}`; -} - - -export default class HeadlineDetector extends ToPdfViewTransformation { - - constructor() { - super("Detect Headlines"); - } - - // Strategy: - // - find most used height => this & every height below is paragraph - // - heights which start a page are likely to be headlines - // - maxHeigth is likely a headline - // - heights which occur on more then one page are likely to be headlines - transform(parseResult:ParseResult) { - const heightAnalyzation = analyzeHeigths(parseResult.content); - - var paragraphHeight = heightAnalyzation.mostUsedHeight + 1; - - // text with more hight then the paragraph height which are on the top of the page are likely to be headlines - const likelyHeadingHeights = new Set(); - parseResult.content.forEach(page => { - page.textItems.forEach(item => { - if (item.height > paragraphHeight && heightAnalyzation.maxYPerPage[page.index] == item.y) { - likelyHeadingHeights.add(item.height); - } - }); - }); - - const headlineHeights = []; - heightAnalyzation.heights.forEach(height => { - if (height == heightAnalyzation.maxHeight || (height > paragraphHeight && likelyHeadingHeights.has(height) && heightAnalyzation[height].pages.size > 1)) { - headlineHeights.push(height); - } - }); - - - const headlineLevels = {}; - headlineHeights.reverse().forEach((height, i) => headlineLevels[height] = i + 1); - var lastMajorHeight = paragraphHeight; - var heights = heightAnalyzation.heights; - for (var i = 0; i < heights.length; i++) { - if (heights[i] > paragraphHeight && !headlineLevels[heights[i]]) { - const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineLevels); - const distanceToLower = heights[i] - lastMajorHeight; - const distanceToHigher = nextMajorHeight - heights[i]; - if (distanceToLower <= distanceToHigher) { - if (lastMajorHeight == paragraphHeight) { - paragraphHeight++; - } else { - headlineLevels[heights[i]] = headlineLevels[lastMajorHeight]; - } - } else { - headlineLevels[heights[i]] = headlineLevels[nextMajorHeight]; - } - } - if (headlineLevels[heights[i]]) { - lastMajorHeight = heights[i]; - } - } - - const newContent = parseResult.content.map(page => { - const newTextItems = []; - page.textItems.forEach(item => { - if (item.height <= paragraphHeight) { - newTextItems.push(item); - } else { - const headlineLevel = headlineLevels[item.height]; - newTextItems.push(new TextItem({ - ...item, - text: item.text, - annotation: new Annotation({ - category: "Headline-" + headlineLevel, - color: 'green' - }), - markdownElement: new Headline({ - level: headlineLevel - }) - })); - } - }); - return { - ...page, - textItems: newTextItems - }; - }); - - return new ParseResult({ - ...parseResult, - content: newContent, - }); - } - - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/old/HeadlineDetector2.jsx b/src/javascript/models/transformations/old/HeadlineDetector2.jsx deleted file mode 100644 index 8306289..0000000 --- a/src/javascript/models/transformations/old/HeadlineDetector2.jsx +++ /dev/null @@ -1,107 +0,0 @@ -import Transformation from './Transformation.jsx'; -import TextItem from '../TextItem.jsx'; -import PdfPage from '../PdfPage.jsx'; -import ContentView from '../ContentView.jsx'; -import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; - -import Headline from '../markdown/Headline.jsx'; - -function getMostUsedHeight(heightToOccurrence) { - var maxOccurence = 0; - var maxHeight = 0; - Object.keys(heightToOccurrence).map((element) => { - if (heightToOccurrence[element] > maxOccurence) { - maxOccurence = heightToOccurrence[element]; - maxHeight = element; - } - }); - return parseInt(maxHeight); -} - - -export default class HeadlineDetector extends Transformation { - - constructor() { - super("Detect Headlines"); - } - - contentView() { - return ContentView.PDF; - } - - // Strategy: - // - find most used height => this & every height below is paragraph - // - heights which start a page are likely to be headlines - // - maxHeigth is likely a headline - // - heights which occur on more then one page are likely to be headlines - transform(pages:PdfPage[]) { - - const heightToOccurrence = {}; - pages.forEach(page => { - page.textItems.forEach(item => { - heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1; - }); - }); - console.debug(heightToOccurrence); - const mostUsedHeight = getMostUsedHeight(heightToOccurrence); - console.debug("mostUsedHeight: " + mostUsedHeight); - - const headlineHeights = new Set(Object.keys(heightToOccurrence).filter(height => parseInt(height) > mostUsedHeight).map(elem => parseInt(elem))); - console.debug(Array.from(headlineHeights)); - const headlineHeights2 = new Set(); - pages.forEach(page => { - const textItems = page.textItems; - for (var i = 0; i < textItems.length; i++) { - const item = textItems[i]; - if (item.height > mostUsedHeight) { - - item.annotation = ADDED_ANNOTATION; - const firstItemOnPage = i == 0; - var upperDistance = 99; - if (!firstItemOnPage) { - upperDistance = textItems[i - 1].y - item.y - item.height; - } - var lowerDistance = 0; - const lastItemOnPage = i == textItems.length - 1; - if (!lastItemOnPage) { - lowerDistance = item.y - textItems[i + 1].y - textItems[i + 1].height; - } - if (firstItemOnPage) { - console.debug("add " + item.height); - console.debug("potential headline: " + item.height + " | " + item.text); - console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage); - console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance); - headlineHeights2.add(item.height); - } - - // if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) { - // console.debug("remove " + item.height); - // console.debug("potential headline: " + item.height + " | " + item.text); - // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage); - // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance); - // headlineHeights.delete(item.height); - // } - - - // if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) { - // item.annotation = ADDED_ANNOTATION; - // } - // console.debug("potential headline: " + item.height + " | " + item.text); - // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage); - // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance); - } - } - }); - console.debug(Array.from(headlineHeights2)); - - return pages; - } - - processAnnotations(pages:PdfPage[]) { - pages.forEach(page => { - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return pages; - } - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/old/HeadlineToUppercase.jsx b/src/javascript/models/transformations/old/HeadlineToUppercase.jsx deleted file mode 100644 index 923348f..0000000 --- a/src/javascript/models/transformations/old/HeadlineToUppercase.jsx +++ /dev/null @@ -1,58 +0,0 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; -import TextItem from '../TextItem.jsx'; -import ParseResult from '../ParseResult.jsx'; -import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx'; - -import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx' - -// Uppercase headlines are often parsed with very mixed character with pdf.js, like 'A heAdLine'. -// This tries to detect them and make them all uppercase. -export default class HeadlineToUppercase extends ToPdfViewTransformation { - - constructor() { - super("Headlines Uppercase"); - } - - transform(parseResult:ParseResult) { - const newContent = parseResult.content.map(page => { - const newTextItems = []; - page.textItems.forEach(item => { - if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') { - const headline = item.text.trim(); - if (hasUpperCaseCharacterInMiddleOfWord(headline)) { - item.annotation = REMOVED_ANNOTATION; - newTextItems.push(item); - newTextItems.push(new TextItem({ - ...item, - text: item.text.toUpperCase(), - annotation: ADDED_ANNOTATION - })); - } else { - item.annotation = UNCHANGED_ANNOTATION; - newTextItems.push(item); - } - } else { - newTextItems.push(item); - } - }); - return { - ...page, - textItems: newTextItems - }; - }); - - return new ParseResult({ - ...parseResult, - content: newContent, - }); - } - - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/old/ToBlockSystem.jsx b/src/javascript/models/transformations/old/ToBlockSystem.jsx deleted file mode 100644 index 0667d35..0000000 --- a/src/javascript/models/transformations/old/ToBlockSystem.jsx +++ /dev/null @@ -1,74 +0,0 @@ -import React from 'react'; -import Transformation from './Transformation.jsx'; -import BlockPageView from '../../components/debug/BlockPageView.jsx'; -import ParseResult from '../ParseResult.jsx'; -import BlockPage from '../BlockPage.jsx'; - -export default class ToBlockSystem extends Transformation { - - constructor() { - super("To Block System"); - } - - createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars - return ; - } - - transform(parseResult:ParseResult) { - const blocks = []; - parseResult.content.forEach(page => { - var minDiff = 99; - var lastY = 0; - page.textItems.forEach(item => { - if (lastY > 0) { - const yDiff = lastY - item.y - item.height; - if (yDiff > 0) { - minDiff = Math.min(minDiff, yDiff); - } - } - lastY = item.y; - }); - - var text; - const rollup = (category) => { - if (text && text.length > 0) { - // console.debug("Push[" + blocks.length + "]: " + text); - blocks.push({ - category: category, - text: text - }); - } - text = null; - }; - - lastY = 0; - page.textItems.forEach(item => { - if (item.markdownElement) { - rollup("Block"); - text = item.markdownElement.transformText(item.text); - rollup(item.markdownElement.constructor.name); - } else if (!text) { - text = item.text; - } else { - const yDiff = lastY - item.y - item.height; - if (yDiff > minDiff + 2) { - rollup("Block"); - text = item.text; - } else { - text += '\n' + item.text; - } - } - lastY = item.y; - }); - rollup("Block") - }); - return new ParseResult({ - ...parseResult, - content: [new BlockPage({ - index: 0, - blocks: blocks - })], - }); - } - -} \ No newline at end of file