From b7393fc806740fe7409bfeddc50688f574943656 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Fri, 17 Feb 2017 08:16:27 +0100 Subject: [PATCH] Detect bold and emphasis --- src/javascript/models/AppState.jsx | 2 + .../transformations/CalculateGlobalStats.jsx | 16 +- .../models/transformations/DetectFormats.jsx | 178 ++++++++++++++++++ 3 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 src/javascript/models/transformations/DetectFormats.jsx diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 12d58c8..165cca9 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -2,6 +2,7 @@ import { Enum } from 'enumify'; import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx'; import RoundCoordinates from './transformations/RoundCoordinates.jsx'; +import DetectFormats from './transformations/DetectFormats.jsx' import CombineSameY from './transformations/CombineSameY.jsx'; import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx' import DetectFootnotes from './transformations/DetectFootnotes.jsx' @@ -23,6 +24,7 @@ export default class AppState { this.transformations = [ new CalculateGlobalStats(), new RoundCoordinates(), + new DetectFormats(), new CombineSameY(), new RemoveWhitespaces(), new DetectFootnotes(), diff --git a/src/javascript/models/transformations/CalculateGlobalStats.jsx b/src/javascript/models/transformations/CalculateGlobalStats.jsx index 3ac0d5e..2ddb72c 100644 --- a/src/javascript/models/transformations/CalculateGlobalStats.jsx +++ b/src/javascript/models/transformations/CalculateGlobalStats.jsx @@ -17,6 +17,12 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
  • { 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
  • +
  • + { 'Max height: ' + parseResult.globals.maxHeight + ' ' } +
  • +
  • + { 'Max height font: ' + parseResult.globals.maxHeightFont + ' ' } +

  • { 'Items per height: ' + JSON.stringify(parseResult.summary.heightToOccurrence) + ' ' } @@ -31,17 +37,25 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation { transform(parseResult:ParseResult) { const heightToOccurrence = {}; const fontToOccurrence = {}; + var maxHeight = 0; + var maxHeightFont; parseResult.content.forEach(page => { page.textItems.forEach(item => { heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1; fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1; + if (item.height > maxHeight) { + maxHeight = item.height; + maxHeightFont = item.font; + } }); }); const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence)); const mostUsedFont = getMostUsedKey(fontToOccurrence); parseResult.globals = { mostUsedHeight: mostUsedHeight, - mostUsedFont: mostUsedFont + mostUsedFont: mostUsedFont, + maxHeight: maxHeight, + maxHeightFont: maxHeightFont } parseResult.summary = { heightToOccurrence: heightToOccurrence, diff --git a/src/javascript/models/transformations/DetectFormats.jsx b/src/javascript/models/transformations/DetectFormats.jsx new file mode 100644 index 0000000..0520fba --- /dev/null +++ b/src/javascript/models/transformations/DetectFormats.jsx @@ -0,0 +1,178 @@ +import React from 'react'; +import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; +import TextItem from '../TextItem.jsx'; +import ParseResult from '../ParseResult.jsx'; +import { REMOVED_ANNOTATION } from '../Annotation.jsx'; +import Annotation from '../Annotation.jsx'; + +//Detect word/sentence formats like bold, italic,... +export default class DetectFormats extends ToPdfViewTransformation { + + constructor() { + super("Detect Bold/Italic"); + } + + createSummaryView(parseResult:ParseResult) { + return
    + Detected + { ' ' + parseResult.summary.foundFormats + ' ' } formats. +
    ; + } + + + transform(parseResult:ParseResult) { + var foundFormats = 0; + const {mostUsedHeight, mostUsedFont, maxHeightFont} = parseResult.globals; + const symbols = { + bold: '**', + emphasis: '_' + } + + const newContent = parseResult.content.map(page => { + const newTextItems = []; + + //bundle items on same Y + const groupedItems = groupByFollowingY(page.textItems); + var lastItem; + var lastFormat; + + const addNextItem = (item, format) => { + if (lastItem) { + if (lastFormat !== format) { + lastItem.text = appendSymbol(lastItem.text, symbols[lastFormat]); + if (lastItem.annotation) { + lastItem.annotation = newAnnotation(lastFormat); + } else { + lastItem.annotation = newAnnotation('End ' + lastFormat); + } + } + lastItem.height = mostUsedHeight; + newTextItems.push(lastItem); + } + + if (format) { + if (lastFormat !== format) { + item.text = prependSymbol(item.text, symbols[format]); + item.annotation = newAnnotation('Start ' + format); + } + lastItem = item; + lastFormat = format; + } else { + newTextItems.push(item); + lastItem = null; + lastFormat = null; + } + }; + + + groupedItems.forEach(itemGroup => { + + //probably headline + const differentHeightsButSameFont = itemsHaveDifferentHeightsButSameFont(itemGroup); + + itemGroup.forEach(item => { + const paragraphHeighOrSlightlyBigger = item.height == mostUsedHeight || item.height == mostUsedHeight + 1; + if (!differentHeightsButSameFont && paragraphHeighOrSlightlyBigger && item.font !== mostUsedFont) { + // item.annotation = REMOVED_ANNOTATION; + + const format = item.font === maxHeightFont ? 'bold' : 'emphasis'; + addNextItem(item, format); + + //TODO test with womb compilation. _Th_, _ff_,... check font like SanSarif ? + //TODO don't touch 'eingerückte' Zeichen => detect early ? + //TODO (Maybe) could detect combined bold & emphasis like font=bold.font + emph.font !? + foundFormats++; + } else { + addNextItem(item); + } + }); + }); + + return { + ...page, + textItems: newTextItems + }; + }); + return new ParseResult({ + ...parseResult, + content: newContent, + summary: { + foundFormats: foundFormats + } + }); + } + + completeTransform(parseResult:ParseResult) { + parseResult.content.forEach(page => { + page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); + page.textItems.forEach(textItem => textItem.annotation = null) + }); + return parseResult; + } + +} + +function newAnnotation(name) { + return new Annotation({ + category: name, + color: 'green' + }); +} + +//groups all following text items with the same Y together +function groupByFollowingY(textItems) { + const yArrays = []; + var itemsWithSameY = []; + var lastItem; + textItems.forEach(item => { + if (itemsWithSameY.length == 0 || item.y == lastItem.y) { + itemsWithSameY.push(item); + } else { + yArrays.push(itemsWithSameY); + itemsWithSameY = [item]; + } + lastItem = item; + }) + yArrays.push(itemsWithSameY); + return yArrays; +} + +function itemsHaveDifferentHeightsButSameFont(itemGroup) { + var heights = new Set(); + var fonts = new Set(); + itemGroup.forEach(item => { + heights.add(item.height); + fonts.add(item.font); + }); + return heights.size > 1 && fonts.size == 1; +} + +//TODO move to stringFunctions + +function prependSymbol(text, symbol) { + if (text.charAt(0) == ' ') { + return ' ' + symbol + removeLeadingWhitespace(text); + } + return symbol + text; +} + +function appendSymbol(text, symbol) { + if (text.charAt(text.length - 1) == ' ') { + return removeTrailingWhitespace(text) + symbol + ' '; + } + return text + symbol; +} + +function removeLeadingWhitespace(text) { + while (text.charAt(0) == ' ') { + text = text.substring(1, text.length); + } + return text; +} + +function removeTrailingWhitespace(text) { + while (text.charAt(text.length - 1) == ' ') { + text = text.substring(0, text.length - 1); + } + return text; +}