diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index 12d58c8..165cca9 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -2,6 +2,7 @@ import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
+import DetectFormats from './transformations/DetectFormats.jsx'
import CombineSameY from './transformations/CombineSameY.jsx';
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
@@ -23,6 +24,7 @@ export default class AppState {
this.transformations = [
new CalculateGlobalStats(),
new RoundCoordinates(),
+ new DetectFormats(),
new CombineSameY(),
new RemoveWhitespaces(),
new DetectFootnotes(),
diff --git a/src/javascript/models/transformations/CalculateGlobalStats.jsx b/src/javascript/models/transformations/CalculateGlobalStats.jsx
index 3ac0d5e..2ddb72c 100644
--- a/src/javascript/models/transformations/CalculateGlobalStats.jsx
+++ b/src/javascript/models/transformations/CalculateGlobalStats.jsx
@@ -17,6 +17,12 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
{ 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
+
+ { 'Max height: ' + parseResult.globals.maxHeight + ' ' }
+
+
+ { 'Max height font: ' + parseResult.globals.maxHeightFont + ' ' }
+
{ 'Items per height: ' + JSON.stringify(parseResult.summary.heightToOccurrence) + ' ' }
@@ -31,17 +37,25 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
transform(parseResult:ParseResult) {
const heightToOccurrence = {};
const fontToOccurrence = {};
+ var maxHeight = 0;
+ var maxHeightFont;
parseResult.content.forEach(page => {
page.textItems.forEach(item => {
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1;
+ if (item.height > maxHeight) {
+ maxHeight = item.height;
+ maxHeightFont = item.font;
+ }
});
});
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
const mostUsedFont = getMostUsedKey(fontToOccurrence);
parseResult.globals = {
mostUsedHeight: mostUsedHeight,
- mostUsedFont: mostUsedFont
+ mostUsedFont: mostUsedFont,
+ maxHeight: maxHeight,
+ maxHeightFont: maxHeightFont
}
parseResult.summary = {
heightToOccurrence: heightToOccurrence,
diff --git a/src/javascript/models/transformations/DetectFormats.jsx b/src/javascript/models/transformations/DetectFormats.jsx
new file mode 100644
index 0000000..0520fba
--- /dev/null
+++ b/src/javascript/models/transformations/DetectFormats.jsx
@@ -0,0 +1,178 @@
+import React from 'react';
+import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
+import TextItem from '../TextItem.jsx';
+import ParseResult from '../ParseResult.jsx';
+import { REMOVED_ANNOTATION } from '../Annotation.jsx';
+import Annotation from '../Annotation.jsx';
+
+/**
+ * Detects inline word/sentence formats (bold, emphasis) from font changes and
+ * wraps the affected runs of text items in the matching markdown symbols.
+ *
+ * An item counts as formatted when its height equals the most-used height (or
+ * is one unit larger), its font differs from the most-used font, and its line
+ * is not a same-font/mixed-height line (probably a headline). The font stored
+ * in globals.maxHeightFont maps to bold, every other font to emphasis.
+ */
+export default class DetectFormats extends ToPdfViewTransformation {
+
+    constructor() {
+        super("Detect Bold/Italic");
+    }
+
+    // Renders the number of formatted items counted by transform().
+    // NOTE(review): the JSX element tags of this return value are not visible in
+    // this view of the patch; the body lines below are reproduced unchanged.
+    // Confirm against the real file.
+    createSummaryView(parseResult:ParseResult) {
+ return
+ Detected
+ { ' ' + parseResult.summary.foundFormats + ' ' } formats.
+
;
+    }
+
+
+    /**
+     * Marks runs of formatted items with markdown symbols, page by page.
+     *
+     * Consecutive items sharing a format form one run: the opening symbol is
+     * prepended to the first item and the closing symbol is appended to the
+     * last one. Every item of a run gets its height set to the most-used
+     * height. The text items of the incoming parseResult are modified in
+     * place; the returned ParseResult carries new page objects and a summary
+     * holding the number of formatted items.
+     */
+    transform(parseResult:ParseResult) {
+        let foundFormats = 0;
+        const {mostUsedHeight, mostUsedFont, maxHeightFont} = parseResult.globals;
+        const symbols = {
+            bold: '**',
+            emphasis: '_'
+        };
+
+        const newContent = parseResult.content.map(page => {
+            const newTextItems = [];
+
+            //bundle items on same Y
+            const groupedItems = groupByFollowingY(page.textItems);
+            //formatted item that was seen last but is not emitted yet, and the format of its run
+            let lastItem = null;
+            let lastFormat = null;
+
+            //Emits the pending item. The run is closed when nextFormat differs from the run's format.
+            const flushLastItem = nextFormat => {
+                if (!lastItem) {
+                    return;
+                }
+                if (lastFormat !== nextFormat) {
+                    lastItem.text = appendSymbol(lastItem.text, symbols[lastFormat]);
+                    //an already annotated item (e.g. the run's own 'Start' item) gets the plain format name
+                    lastItem.annotation = newAnnotation(lastItem.annotation ? lastFormat : 'End ' + lastFormat);
+                }
+                lastItem.height = mostUsedHeight;
+                newTextItems.push(lastItem);
+                lastItem = null;
+            };
+
+            //Formatted items are held back until the next item shows whether the run continues.
+            const addNextItem = (item, format) => {
+                flushLastItem(format);
+
+                if (format) {
+                    if (lastFormat !== format) {
+                        item.text = prependSymbol(item.text, symbols[format]);
+                        item.annotation = newAnnotation('Start ' + format);
+                    }
+                    lastItem = item;
+                    lastFormat = format;
+                } else {
+                    newTextItems.push(item);
+                    lastItem = null;
+                    lastFormat = null;
+                }
+            };
+
+            groupedItems.forEach(itemGroup => {
+
+                //probably headline
+                const differentHeightsButSameFont = itemsHaveDifferentHeightsButSameFont(itemGroup);
+
+                itemGroup.forEach(item => {
+                    const paragraphHeightOrSlightlyBigger = item.height == mostUsedHeight || item.height == mostUsedHeight + 1;
+                    if (!differentHeightsButSameFont && paragraphHeightOrSlightlyBigger && item.font !== mostUsedFont) {
+                        const format = item.font === maxHeightFont ? 'bold' : 'emphasis';
+                        addNextItem(item, format);
+
+                        //TODO test with womb compilation. _Th_, _ff_,... check font like SanSarif ?
+                        //TODO don't touch indented characters => detect early ?
+                        //TODO (Maybe) could detect combined bold & emphasis like font=bold.font + emph.font !?
+                        foundFormats++;
+                    } else {
+                        addNextItem(item);
+                    }
+                });
+            });
+
+            //A run that reaches the end of the page has no following item to trigger the flush;
+            //without this call its last item would be lost and the run would stay unclosed.
+            flushLastItem(null);
+
+            return {
+                ...page,
+                textItems: newTextItems
+            };
+        });
+        return new ParseResult({
+            ...parseResult,
+            content: newContent,
+            summary: {
+                foundFormats: foundFormats
+            }
+        });
+    }
+
+    // Drops items carrying REMOVED_ANNOTATION and resets the annotation of all
+    // remaining items to null; works on the given parseResult in place.
+    completeTransform(parseResult:ParseResult) {
+        parseResult.content.forEach(page => {
+            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
+            page.textItems.forEach(textItem => textItem.annotation = null)
+        });
+        return parseResult;
+    }
+
+}
+
+/**
+ * Creates a green Annotation whose category is the given name.
+ */
+function newAnnotation(name) {
+    const annotationOptions = { category: name, color: 'green' };
+    return new Annotation(annotationOptions);
+}
+
+/**
+ * Groups directly following text items that share the same Y together.
+ *
+ * The input order is kept; a new group starts whenever an item's y differs
+ * from the y of the item right before it. An empty input yields an empty
+ * list (no groups) instead of a list holding one empty group.
+ */
+function groupByFollowingY(textItems) {
+    const yArrays = [];
+    let currentGroup = null;
+    textItems.forEach(item => {
+        //compare against the previous item only, equal Y values further apart stay in separate groups
+        if (currentGroup && currentGroup[currentGroup.length - 1].y == item.y) {
+            currentGroup.push(item);
+        } else {
+            currentGroup = [item];
+            yArrays.push(currentGroup);
+        }
+    });
+    return yArrays;
+}
+
+/**
+ * Tells whether the items of a group use exactly one font while showing more
+ * than one distinct height.
+ */
+function itemsHaveDifferentHeightsButSameFont(itemGroup) {
+    const distinctHeights = new Set(itemGroup.map(item => item.height));
+    const distinctFonts = new Set(itemGroup.map(item => item.font));
+    return distinctFonts.size == 1 && distinctHeights.size > 1;
+}
+
+//TODO move to stringFunctions
+
+/**
+ * Puts the symbol in front of the text. When the text starts with a space,
+ * its leading spaces are replaced by a single space that stays in front of
+ * the symbol.
+ */
+function prependSymbol(text, symbol) {
+    const startsWithSpace = text.charAt(0) == ' ';
+    return startsWithSpace
+        ? ' ' + symbol + removeLeadingWhitespace(text)
+        : symbol + text;
+}
+
+/**
+ * Puts the symbol behind the text. When the text ends with a space, its
+ * trailing spaces are replaced by a single space that stays behind the symbol.
+ */
+function appendSymbol(text, symbol) {
+    const endsWithSpace = text.charAt(text.length - 1) == ' ';
+    return endsWithSpace
+        ? removeTrailingWhitespace(text) + symbol + ' '
+        : text + symbol;
+}
+
+/**
+ * Returns the text without its leading space characters (only ' ' is
+ * stripped, other whitespace is kept).
+ */
+function removeLeadingWhitespace(text) {
+    let firstKept = 0;
+    while (firstKept < text.length && text.charAt(firstKept) == ' ') {
+        firstKept++;
+    }
+    return text.substring(firstKept, text.length);
+}
+
+/**
+ * Returns the text without its trailing space characters (only ' ' is
+ * stripped, other whitespace is kept).
+ */
+function removeTrailingWhitespace(text) {
+    let keptLength = text.length;
+    while (keptLength > 0 && text.charAt(keptLength - 1) == ' ') {
+        keptLength--;
+    }
+    return text.substring(0, keptLength);
+}