diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx
index 1af8dc5..b90c023 100644
--- a/src/javascript/components/debug/TextItemTable.jsx
+++ b/src/javascript/components/debug/TextItemTable.jsx
@@ -52,8 +52,26 @@ export default class TextItemTable extends React.Component {
{ textItem.type ? textItem.type.name : '' }
- { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
- { textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
+ { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ?
+ Footnote-Link
+
: '' }
+ { textItem.parsedElements && textItem.parsedElements.containLinks ?
+ Link
+
: '' }
+ { textItem.lineFormat ?
+ { textItem.lineFormat.name }
+
: '' }
+ { textItem.unopenedFormat ?
+ Unopened
+ { ' ' + textItem.unopenedFormat.name }
+
: '' }
+ { textItem.parsedElements && textItem.parsedElements.inlineFormats > 0 ?
+ { textItem.parsedElements.inlineFormats + 'x Bold/Italic' }
+
: '' }
+ { textItem.unclosedFormat ?
+ Unclosed
+ { ' ' + textItem.unclosedFormat.name }
+
: '' }
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index d9462dd..7ecdb4d 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -8,12 +8,11 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
+import CompleteFormats from './transformations/textitem/CompleteFormats.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
-// import DetectFormats from './transformations/DetectFormats.jsx'
-// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
@@ -59,15 +58,14 @@ export default class AppState {
new VerticalToHorizontal(),
new PostprocessLines(),
new DetectTOC(),
- new DetectListItems(),
new DetectHeaders(),
+ new CompleteFormats(),
+ new DetectListItems(),
new GatherBlocks(),
new DetectCodeQuoteBlocks(),
new DetectListLevels(),
- // new DetectFormats(),
- // new HeadlineToUppercase(),
new ToTextBlocks(),
new ToMarkdown()];
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
index 080fb02..dbb00af 100644
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@@ -18,12 +18,14 @@ export class ParsedElements {
this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks;
+ this.inlineFormats = options.inlineFormats || 0;
}
add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks;
+ this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats; // accumulate the other instance's inline-format count into this one
}
}
\ No newline at end of file
diff --git a/src/javascript/models/StringFormat.jsx b/src/javascript/models/StringFormat.jsx
index be112bb..54a7041 100644
--- a/src/javascript/models/StringFormat.jsx
+++ b/src/javascript/models/StringFormat.jsx
@@ -3,4 +3,23 @@ import { Enum } from 'enumify';
export default class StringFormat extends Enum {
}
-StringFormat.initEnum(['STANDARD', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE'])
\ No newline at end of file
+StringFormat.initEnum({ // every enum instance carries needFormat and, when it is true, the symbols wrapped around a formatted run
+ STANDARD: { // assigned to the most used font by CalculateGlobalStats
+ needFormat: false
+ },
+ BOLD: {
+ needFormat: true,
+ startSymbol: '**', // prepended to the text of the first item of a run
+ endSymbol: '**' // appended to the text of the last item of a run
+ },
+ OBLIQUE: {
+ needFormat: true,
+ startSymbol: '_',
+ endSymbol: '_'
+ },
+ BOLD_OBLIQUE: {
+ needFormat: true,
+ startSymbol: '**_',
+ endSymbol: '_**' // reverse order of startSymbol so that the two markers nest
+ }
+})
\ No newline at end of file
diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx
index fb1ab2c..d013cc1 100644
--- a/src/javascript/models/TextItem.jsx
+++ b/src/javascript/models/TextItem.jsx
@@ -13,6 +13,10 @@ export default class TextItem extends PageItem {
this.font = options.font;
this.fontAscent = options.fontAscent;
this.fontDescent = options.fontDescent;
+
+ this.lineFormat = options.lineFormat;
+ this.unopenedFormat = options.unopenedFormat;
+ this.unclosedFormat = options.unclosedFormat;
}
}
diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx
index 68b0bd7..62344f2 100644
--- a/src/javascript/models/TextItemLineCompactor.jsx
+++ b/src/javascript/models/TextItemLineCompactor.jsx
@@ -7,10 +7,8 @@ import { sortByX } from '../textItemFunctions.jsx'
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
export default class TextItemLineCompactor {
- constructor(options) {
- if (options) {
- this.transformEmphasis = options.transformEmphasis || true;
- }
+ constructor(fontToFormats) { // fontToFormats: queried as fontToFormats.get(item.font) to obtain the StringFormat of an item
+ this.fontToFormats = fontToFormats;
+ }
// returns a CombineResult
@@ -22,8 +20,10 @@ export default class TextItemLineCompactor {
// we can't trust order of occurence, esp. footnoteLinks like to come last
sortByX(lineItems);
- var combinedItem;
const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
+ const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
+
+ var combinedItem;
if (resolvedLineItems.length == 1) {
combinedItem = resolvedLineItems[0];
} else {
@@ -51,9 +51,93 @@ export default class TextItemLineCompactor {
});
}
combinedItem.parsedElements = parsedElements;
+ combinedItem.lineFormat = lineFormat;
+ combinedItem.unopenedFormat = unopenedFormat;
+ combinedItem.unclosedFormat = unclosedFormat;
return combinedItem;
}
+    addFormats(resolvedLineItems, parsedElements) { // wraps runs of format-needing items in start/end symbols; returns [lineFormat, unopenedFormat, unclosedFormat]
+        var inlineFormats = 0; // runs that received both a start and an end symbol
+        var openFormatType; // format of the run currently open, if any
+        var openFormatItem; // first item of that run
+        var openFormatIndex; // position of that first item in resolvedLineItems
+        var lastItem; // item handled by the previous loop iteration
+        // Results: set when a run covers the whole line / starts at its first item / ends at its last item.
+        var lineFormat;
+        var unopenedFormat;
+        var unclosedFormat;
+        // Each helper swaps one entry of resolvedLineItems for a copy whose text carries the symbol(s).
+        const addStartSymbol = () => {
+            resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
+                ...openFormatItem,
+                text: openFormatType.startSymbol + openFormatItem.text
+            }));
+        }
+        const addEndSymbol = (index) => {
+            resolvedLineItems.splice(index, 1, new TextItem({
+                ...lastItem,
+                text: lastItem.text + openFormatType.endSymbol
+            }));
+        }
+        const addCompleteSymbol = () => {
+            resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
+                ...openFormatItem,
+                text: openFormatType.startSymbol + openFormatItem.text + openFormatType.endSymbol
+            }));
+        }
+        // Finishes the open run, which ends at endIndex (lastItem is the item at that index).
+        const rollupOpenFormat = (endIndex) => {
+            const formatFromBeginningOfLine = openFormatIndex === 0;
+            const formatToEndOfLine = endIndex === resolvedLineItems.length - 1;
+            if (formatFromBeginningOfLine) {
+                if (formatToEndOfLine) {
+                    lineFormat = openFormatType;
+                } else {
+                    unopenedFormat = openFormatType;
+                    addEndSymbol(endIndex);
+                }
+            } else {
+                if (formatToEndOfLine) {
+                    unclosedFormat = openFormatType;
+                    addStartSymbol();
+                } else {
+                    inlineFormats++;
+                    if (lastItem === openFormatItem) {
+                        addCompleteSymbol();
+                    } else {
+                        addStartSymbol();
+                        addEndSymbol(endIndex); // the index is required: splice(undefined, ...) would overwrite entry 0
+                    }
+                }
+            }
+        };
+        // Iterate over a copy: the helpers above splice the live array during the loop.
+        resolvedLineItems.slice().forEach((item, i) => {
+            const formatType = this.fontToFormats.get(item.font);
+            if (openFormatType) {
+                if (formatType !== openFormatType) { //closing existing format
+                    rollupOpenFormat(i - 1);
+                    openFormatType = formatType.needFormat ? formatType : null;
+                    openFormatItem = formatType.needFormat ? item : null;
+                    openFormatIndex = formatType.needFormat ? i : null;
+                }
+            } else {
+                if (formatType.needFormat) {
+                    openFormatType = formatType;
+                    openFormatItem = item;
+                    openFormatIndex = i;
+                }
+            }
+            lastItem = item;
+        });
+        if (openFormatType) {
+            rollupOpenFormat(resolvedLineItems.length - 1);
+        }
+        parsedElements.inlineFormats = inlineFormats;
+        return [lineFormat, unopenedFormat, unclosedFormat];
+    }
+
resolveSpecialElements(lineItems) {
const footnoteLinks = [];
const footnotes = [];
diff --git a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
index 4abe622..e28d19f 100644
--- a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
+++ b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
@@ -57,11 +57,11 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
var format;
if (key == mostUsedFont) {
format = StringFormat.STANDARD;
- } else if (fontName.includes('bold') && fontName.includes('bold')) {
+ } else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
format = StringFormat.BOLD_OBLIQUE;
} else if (fontName.includes('bold')) {
format = StringFormat.BOLD;
- } else if (fontName.includes('oblique')) {
+ } else if (fontName.includes('oblique') || fontName.includes('italic')) {
format = StringFormat.OBLIQUE;
} else if (fontName === maxHeightFont) {
format = StringFormat.BOLD;
diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx
index 890e2ee..4b5b6f8 100644
--- a/src/javascript/models/transformations/textitem/CompactLines.jsx
+++ b/src/javascript/models/transformations/textitem/CompactLines.jsx
@@ -2,6 +2,7 @@ import React from 'react';
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
+import { ParsedElements } from '../../PageItem.jsx';
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
import TextItemLineCompactor from '../../TextItemLineCompactor.jsx';
import ElementType from '../../ElementType.jsx';
@@ -16,13 +17,18 @@ export default class CompactLines extends ToTextItemTransformation {
}
transform(parseResult:ParseResult) {
- const {mostUsedDistance} = parseResult.globals;
+ const {mostUsedDistance, fontToFormats} = parseResult.globals;
const foundFootnotes = [];
const foundFootnoteLinks = [];
+ var inlineFormats = 0;
+ var lineFormats = 0;
+ var unopenedFormats = 0;
+ var unclosedFormats = 0;
+
const lineGrouper = new TextItemLineGrouper({
mostUsedDistance: mostUsedDistance,
});
- const lineCompactor = new TextItemLineCompactor();
+ const lineCompactor = new TextItemLineCompactor(fontToFormats);
parseResult.pages.forEach(page => {
if (page.items.length > 0) {
@@ -32,6 +38,13 @@ export default class CompactLines extends ToTextItemTransformation {
var lineItem;
if (textItemsOfLine.length == 1) {
lineItem = textItemsOfLine[0];
+ const formatType = fontToFormats.get(lineItem.font);
+ if (formatType.needFormat) {
+ lineItem.lineFormat = formatType;
+ lineItem.parsedElements = new ParsedElements({
+ completeLineFormats: 1
+ });
+ }
} else {
textItemsOfLine.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
@@ -50,7 +63,11 @@ export default class CompactLines extends ToTextItemTransformation {
const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },);
foundFootnotes.push.apply(foundFootnotes, footnotes);
}
+ inlineFormats += lineItem.parsedElements.inlineFormats;
}
+ if (lineItem.lineFormat) lineFormats++;
+ if (lineItem.unopenedFormat) unopenedFormats++;
+ if (lineItem.unclosedFormat) unclosedFormats++;
lineItem.text = lineItem.text.trim();
newItems.push(lineItem);
});
@@ -62,9 +79,10 @@ export default class CompactLines extends ToTextItemTransformation {
return new ParseResult({
...parseResult,
messages: [
- // 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']',
- //'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']',
- // 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']',
+ 'Detected ' + lineFormats + ' line formats',
+ 'Detected ' + inlineFormats + ' inline formats',
+ 'Detected ' + unclosedFormats + ' opened un-closed formats',
+ 'Detected ' + unopenedFormats + ' un-opened closed formats',
Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }],
Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }],
]
diff --git a/src/javascript/models/transformations/textitem/CompleteFormats.jsx b/src/javascript/models/transformations/textitem/CompleteFormats.jsx
new file mode 100644
index 0000000..c073f34
--- /dev/null
+++ b/src/javascript/models/transformations/textitem/CompleteFormats.jsx
@@ -0,0 +1,170 @@
+import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ParseResult from '../../ParseResult.jsx';
+import TextItem from '../../TextItem.jsx';
+import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx';
+
+//Complete unopened/unclosed bold/italic formats
+export default class CompleteFormats extends ToTextItemTransformation {
+
+    constructor() {
+        super("Complete Bold/Italics");
+    }
+
+    // Two passes over parseResult.pages, both mutating the pages in place:
+    //  1. headline items that carry any format flag get all three flags
+    //     reset to null and are annotated UNCHANGED_ANNOTATION;
+    //  2. each page's items are fed through a fresh ItemStack and replaced
+    //     by what the stack returns.
+    // Returns a new ParseResult spreading parseResult, with empty messages.
+    transform(parseResult:ParseResult) {
+        for (const page of parseResult.pages) {
+            for (const item of page.items) {
+                const isHeadline = item.type && item.type.headline;
+                if (isHeadline && (item.lineFormat || item.unopenedFormat || item.unclosedFormat)) {
+                    item.lineFormat = null;
+                    item.unopenedFormat = null;
+                    item.unclosedFormat = null;
+                    item.annotation = UNCHANGED_ANNOTATION;
+                }
+            }
+        }
+        for (const page of parseResult.pages) {
+            const stack = new ItemStack();
+            for (const item of page.items) {
+                stack.consume(item);
+            }
+            page.items = stack.getResults();
+        }
+        return new ParseResult({
+            ...parseResult,
+            messages: []
+        });
+    }
+}
+
+class ItemStack {
+
+    constructor() {
+        this.openFormat = null; // format left open by an earlier item
+        this.openFormatItem = null; // item to extend if that format must be force-closed
+        this.resultItems = [];
+    }
+
+    cache(textItem, format) { // remember format as open, with textItem as its carrier
+        this.openFormat = format;
+        this.openFormatItem = textItem;
+    }
+
+    closeOpenFormat() { // force-close: cached item becomes REMOVED, a copy ending in the end symbol is written
+        if (this.openFormat) {
+            this.openFormatItem.annotation = REMOVED_ANNOTATION;
+            this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat));
+            this.clear();
+        }
+    }
+
+    clear() { // forget the open format without writing anything
+        this.openFormat = null;
+        this.openFormatItem = null;
+    }
+
+    writeToResults(textItem) {
+        this.resultItems.push(textItem);
+    }
+
+    // Force-closes a format that is still open, then returns every written item.
+    getResults() {
+        this.closeOpenFormat(); // no-op when nothing is open
+        return this.resultItems;
+    }
+
+    // Writes item, plus a symbol-carrying copy when its format flags require one,
+    // closing or carrying over the format left open by earlier items.
+    consume(item) {
+        var newItem; // replacement copy of item, written right after it
+
+        // item ends a format that is not currently open: build a copy starting with the start symbol
+        const handleFreshUnopened = () => {
+            item.annotation = REMOVED_ANNOTATION;
+            newItem = textItemWithOpening(item, item.unopenedFormat);
+        }
+
+        // fully formatted line: same kind of copy, and the format stays open on it
+        const handleFreshLine = () => {
+            item.annotation = REMOVED_ANNOTATION;
+            newItem = textItemWithOpening(item, item.lineFormat);
+            this.cache(newItem, item.lineFormat);
+        }
+
+        // item opens a format it does not close. If handleFreshUnopened already
+        // built a copy, that copy carries the open format and must still be
+        // written below: a matching next item closes via clear(), which writes
+        // nothing, so a copy held only in the cache would be lost.
+        const handleFreshUnclosed = () => {
+            this.cache(newItem || item, item.unclosedFormat);
+        }
+
+        //flush open format if possible
+        if (this.openFormat) {
+            if (item.unopenedFormat) {
+                if (item.unopenedFormat === this.openFormat) {
+                    //good, closing an opened
+                    this.clear();
+                } else {
+                    this.closeOpenFormat();
+                    handleFreshUnopened();
+                }
+            }
+
+            if (item.lineFormat) {
+                if (item.lineFormat === this.openFormat) {
+                    this.cache(item, item.lineFormat);
+                } else {
+                    this.closeOpenFormat();
+                    handleFreshLine();
+                }
+            }
+
+            if (item.unclosedFormat) {
+                this.closeOpenFormat();
+                handleFreshUnclosed();
+            }
+
+            if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) {
+                this.closeOpenFormat();
+            }
+
+        } else { // handle fresh items
+            if (item.unopenedFormat) {
+                handleFreshUnopened();
+            }
+            if (item.lineFormat) {
+                handleFreshLine();
+            }
+            if (item.unclosedFormat) {
+                handleFreshUnclosed();
+            }
+        }
+
+        this.writeToResults(item);
+        if (newItem) {
+            this.writeToResults(newItem);
+        }
+    }
+}
+
+function textItemWithOpening(textItem, format) {
+    // Builds a new TextItem from textItem's own properties, with the
+    // format's start symbol in front of the text and ADDED_ANNOTATION set.
+    const text = format.startSymbol + textItem.text;
+    const options = { ...textItem, text, annotation: ADDED_ANNOTATION };
+    return new TextItem(options);
+}
+
+function textItemWithClosing(textItem, format) {
+    // Builds a new TextItem from textItem's own properties, with the
+    // format's end symbol after the text and ADDED_ANNOTATION set.
+    const text = textItem.text + format.endSymbol;
+    const options = { ...textItem, text, annotation: ADDED_ANNOTATION };
+    return new TextItem(options);
+}
diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
index 328e77c..6d3d20d 100644
--- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx
+++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
@@ -3,6 +3,7 @@ import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx';
+import { isListItem } from '../../../functions.jsx';
//Detect items starting with -, •, etc...
export default class DetectHeaders extends ToTextItemTransformation {
@@ -56,7 +57,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
var lastHeight;
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
- if (!textItem.type && textItem.height > mostUsedHeight) {
+ if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) {
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
heights.push(textItem.height);
}
@@ -69,7 +70,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
const headlineType = headlineByLevel(2 + i);
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
- if (!textItem.type && textItem.height == height) {
+ if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) {
detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = headlineType;
|