diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx index 1af8dc5..b90c023 100644 --- a/src/javascript/components/debug/TextItemTable.jsx +++ b/src/javascript/components/debug/TextItemTable.jsx @@ -52,8 +52,26 @@ export default class TextItemTable extends React.Component { { textItem.type ? textItem.type.name : '' }
- { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } - { textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' } + { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ?
+ Footnote-Link +
: '' } + { textItem.parsedElements && textItem.parsedElements.containLinks ?
+ Link +
: '' } + { textItem.lineFormat ?
+ { textItem.lineFormat.name } +
: '' } + { textItem.unopenedFormat ?
+ Unopened + { ' ' + textItem.unopenedFormat.name } +
: '' } + { textItem.parsedElements && textItem.parsedElements.inlineFormats > 0 ?
+ { textItem.parsedElements.inlineFormats + 'x Bold/Italic' } +
: '' } + { textItem.unclosedFormat ?
+ Unclosed + { ' ' + textItem.unclosedFormat.name } +
: '' }
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index d9462dd..7ecdb4d 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -8,12 +8,11 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx'; import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' +import CompleteFormats from './transformations/textitem/CompleteFormats.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx' -// import DetectFormats from './transformations/DetectFormats.jsx' -// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' import ToTextBlocks from './transformations/ToTextBlocks.jsx'; import ToMarkdown from './transformations/ToMarkdown.jsx' @@ -59,15 +58,14 @@ export default class AppState { new VerticalToHorizontal(), new PostprocessLines(), new DetectTOC(), - new DetectListItems(), new DetectHeaders(), + new CompleteFormats(), + new DetectListItems(), new GatherBlocks(), new DetectCodeQuoteBlocks(), new DetectListLevels(), - // new DetectFormats(), - // new HeadlineToUppercase(), new ToTextBlocks(), new ToMarkdown()]; diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx index 080fb02..dbb00af 100644 --- a/src/javascript/models/PageItem.jsx +++ b/src/javascript/models/PageItem.jsx @@ -18,12 +18,14 @@ export class ParsedElements { this.footnoteLinks = options.footnoteLinks || []; this.footnotes = options.footnotes || []; this.containLinks = options.containLinks; + this.inlineFormats = options.inlineFormats || 0; } add(parsedElements) { this.footnoteLinks = 
this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.containLinks = this.containLinks || parsedElements.containLinks; + this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats; } } \ No newline at end of file diff --git a/src/javascript/models/StringFormat.jsx b/src/javascript/models/StringFormat.jsx index be112bb..54a7041 100644 --- a/src/javascript/models/StringFormat.jsx +++ b/src/javascript/models/StringFormat.jsx @@ -3,4 +3,23 @@ import { Enum } from 'enumify'; export default class StringFormat extends Enum { } -StringFormat.initEnum(['STANDARD', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']) \ No newline at end of file +StringFormat.initEnum({ + STANDARD: { + needFormat: false + }, + BOLD: { + needFormat: true, + startSymbol: '**', + endSymbol: '**' + }, + OBLIQUE: { + needFormat: true, + startSymbol: '_', + endSymbol: '_' + }, + BOLD_OBLIQUE: { + needFormat: true, + startSymbol: '**_', + endSymbol: '_**' + } +}) \ No newline at end of file diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx index fb1ab2c..d013cc1 100644 --- a/src/javascript/models/TextItem.jsx +++ b/src/javascript/models/TextItem.jsx @@ -13,6 +13,10 @@ export default class TextItem extends PageItem { this.font = options.font; this.fontAscent = options.fontAscent; this.fontDescent = options.fontDescent; + + this.lineFormat = options.lineFormat; + this.unopenedFormat = options.unopenedFormat; + this.unclosedFormat = options.unclosedFormat; } } diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx index 68b0bd7..62344f2 100644 --- a/src/javascript/models/TextItemLineCompactor.jsx +++ b/src/javascript/models/TextItemLineCompactor.jsx @@ -7,10 +7,8 @@ import { sortByX } from '../textItemFunctions.jsx' //'whitespace removal', bold/emphasis annotation, link-detection, etc.. 
export default class TextItemLineCompactor { - constructor(options) { - if (options) { - this.transformEmphasis = options.transformEmphasis || true; - } + constructor(fontToFormats) { + this.fontToFormats = fontToFormats; } // returns a CombineResult @@ -22,8 +20,10 @@ export default class TextItemLineCompactor { // we can't trust order of occurence, esp. footnoteLinks like to come last sortByX(lineItems); - var combinedItem; const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems); + const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements); + + var combinedItem; if (resolvedLineItems.length == 1) { combinedItem = resolvedLineItems[0]; } else { @@ -51,9 +51,93 @@ export default class TextItemLineCompactor { }); } combinedItem.parsedElements = parsedElements; + combinedItem.lineFormat = lineFormat; + combinedItem.unopenedFormat = unopenedFormat; + combinedItem.unclosedFormat = unclosedFormat; return combinedItem; } + addFormats(resolvedLineItems, parsedElements) { /* Marks runs of items whose font maps to a format with needFormat: a run strictly inside the line gets both start and end symbol (counted in parsedElements.inlineFormats); a run touching only the line start gets just the end symbol (reported as unopenedFormat); a run touching only the line end gets just the start symbol (unclosedFormat); a run covering the whole line gets no symbol (lineFormat). Items are replaced in place in resolvedLineItems. Returns [lineFormat, unopenedFormat, unclosedFormat]. */ + var inlineFormats = 0; + var openFormatType; + var openFormatItem; + var openFormatIndex; + var lastItem; + + var lineFormat; + var unopenedFormat; + var unclosedFormat; + + const addStartSymbol = () => { + resolvedLineItems.splice(openFormatIndex, 1, new TextItem({ + ...openFormatItem, + text: openFormatType.startSymbol + openFormatItem.text + })); + } + const addEndSymbol = (index) => { + resolvedLineItems.splice(index, 1, new TextItem({ + ...lastItem, + text: lastItem.text + openFormatType.endSymbol + })); + } + const addCompleteSymbol = () => { + resolvedLineItems.splice(openFormatIndex, 1, new TextItem({ + ...openFormatItem, + text: openFormatType.startSymbol + openFormatItem.text + openFormatType.endSymbol + })); + } + + const rollupOpenFormat = (endIndex) => { + const formatFromBeginningOfLine = openFormatIndex === 0; + const formatToEndOfLine = endIndex === resolvedLineItems.length - 1; + if (formatFromBeginningOfLine) { + if
(formatToEndOfLine) { + lineFormat = openFormatType; + } else { + unopenedFormat = openFormatType; + addEndSymbol(endIndex); + } + } else { + if (formatToEndOfLine) { + unclosedFormat = openFormatType; + addStartSymbol(); + } else { + inlineFormats++; + if (lastItem === openFormatItem) { + addCompleteSymbol(); + } else { + addStartSymbol(); + addEndSymbol(endIndex); /* the index is required: splice() coerces an undefined start to 0 and would overwrite the first item of the line */ + } + } + } + }; + + resolvedLineItems.slice().forEach((item, i) => { + const formatType = this.fontToFormats.get(item.font); + if (openFormatType) { + if (formatType !== openFormatType) { //closing existing format + rollupOpenFormat(i - 1); + openFormatType = formatType.needFormat ? formatType : null; + openFormatItem = formatType.needFormat ? item : null; + openFormatIndex = formatType.needFormat ? i : null; + } + } else { + if (formatType.needFormat) { + openFormatType = formatType; + openFormatItem = item; + openFormatIndex = i; + } + } + lastItem = item; + }); + if (openFormatType) { + rollupOpenFormat(resolvedLineItems.length - 1); + } + parsedElements.inlineFormats = inlineFormats; + return [lineFormat, unopenedFormat, unclosedFormat]; + } + resolveSpecialElements(lineItems) { const footnoteLinks = []; const footnotes = []; diff --git a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx index 4abe622..e28d19f 100644 --- a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx +++ b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx @@ -57,11 +57,11 @@ export default class CalculateGlobalStats extends ToTextItemTransformation { var format; if (key == mostUsedFont) { format = StringFormat.STANDARD; - } else if (fontName.includes('bold') && fontName.includes('bold')) { + } else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) { format = StringFormat.BOLD_OBLIQUE; } else if (fontName.includes('bold')) { format = StringFormat.BOLD; -
} else if (fontName.includes('oblique')) { + } else if (fontName.includes('oblique') || fontName.includes('italic')) { format = StringFormat.OBLIQUE; } else if (fontName === maxHeightFont) { format = StringFormat.BOLD; diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx index 890e2ee..4b5b6f8 100644 --- a/src/javascript/models/transformations/textitem/CompactLines.jsx +++ b/src/javascript/models/transformations/textitem/CompactLines.jsx @@ -2,6 +2,7 @@ import React from 'react'; import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; +import { ParsedElements } from '../../PageItem.jsx'; import TextItemLineGrouper from '../../TextItemLineGrouper.jsx'; import TextItemLineCompactor from '../../TextItemLineCompactor.jsx'; import ElementType from '../../ElementType.jsx'; @@ -16,13 +17,18 @@ export default class CompactLines extends ToTextItemTransformation { } transform(parseResult:ParseResult) { - const {mostUsedDistance} = parseResult.globals; + const {mostUsedDistance, fontToFormats} = parseResult.globals; const foundFootnotes = []; const foundFootnoteLinks = []; + var inlineFormats = 0; + var lineFormats = 0; + var unopenedFormats = 0; + var unclosedFormats = 0; + const lineGrouper = new TextItemLineGrouper({ mostUsedDistance: mostUsedDistance, }); - const lineCompactor = new TextItemLineCompactor(); + const lineCompactor = new TextItemLineCompactor(fontToFormats); parseResult.pages.forEach(page => { if (page.items.length > 0) { @@ -32,6 +38,13 @@ export default class CompactLines extends ToTextItemTransformation { var lineItem; if (textItemsOfLine.length == 1) { lineItem = textItemsOfLine[0]; + const formatType = fontToFormats.get(lineItem.font); + if (formatType.needFormat) { + lineItem.lineFormat = formatType; + lineItem.parsedElements = new ParsedElements({ + completeLineFormats: 1 + }); + } } else { 
textItemsOfLine.forEach(item => { item.annotation = REMOVED_ANNOTATION; @@ -50,7 +63,11 @@ export default class CompactLines extends ToTextItemTransformation { const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },); foundFootnotes.push.apply(foundFootnotes, footnotes); } + inlineFormats += lineItem.parsedElements.inlineFormats; } + if (lineItem.lineFormat) lineFormats++; + if (lineItem.unopenedFormat) unopenedFormats++; + if (lineItem.unclosedFormat) unclosedFormats++; lineItem.text = lineItem.text.trim(); newItems.push(lineItem); }); @@ -62,9 +79,10 @@ export default class CompactLines extends ToTextItemTransformation { return new ParseResult({ ...parseResult, messages: [ - // 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']', - //'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']', - // 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']', + 'Detected ' + lineFormats + ' line formats', + 'Detected ' + inlineFormats + ' inline formats', + 'Detected ' + unclosedFormats + ' opened un-closed formats', + 'Detected ' + unopenedFormats + ' un-opened closed formats', Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }], Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }], ] diff --git a/src/javascript/models/transformations/textitem/CompleteFormats.jsx b/src/javascript/models/transformations/textitem/CompleteFormats.jsx new file mode 100644 index 0000000..c073f34 --- /dev/null +++ b/src/javascript/models/transformations/textitem/CompleteFormats.jsx @@ -0,0 +1,170 @@ +import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ParseResult from '../../ParseResult.jsx'; +import TextItem from '../../TextItem.jsx'; +import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx'; + +//Complete unopened/unclosed bold/italic formats +export default 
class CompleteFormats extends ToTextItemTransformation { + + constructor() { + super("Complete Bold/Italics"); + } + + transform(parseResult:ParseResult) { + // remove line formats from headers + parseResult.pages.forEach(page => { + page.items.forEach(item => { + if (item.type && item.type.headline) { + if (item.lineFormat || item.unopenedFormat || item.unclosedFormat) { + item.lineFormat = null; + item.unopenedFormat = null; + item.unclosedFormat = null; + item.annotation = UNCHANGED_ANNOTATION; + } + } + }); + }); + + //close open formats + parseResult.pages.forEach(page => { + const itemStack = new ItemStack(); + page.items.forEach(item => { + itemStack.consume(item); + }); + page.items = itemStack.getResults(); + }); + return new ParseResult({ + ...parseResult, + messages: [] + }); + + } + +} + +class ItemStack { /* Collects the items of one page in resultItems while remembering at most one open format together with the item that must later receive that format's end symbol. */ + + constructor() { + this.openFormat = null; + this.openFormatItem = null; + this.resultItems = []; + } + + cache(textItem, format) { + this.openFormat = format; + this.openFormatItem = textItem; + } + + closeOpenFormat() { + if (this.openFormat) { + this.openFormatItem.annotation = REMOVED_ANNOTATION; + this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat)); + this.clear(); + } + } + + clear() { + this.openFormat = null; + this.openFormatItem = null; + } + + writeToResults(textItem) { + this.resultItems.push(textItem); + } + + + getResults() { + if (this.openFormat) { + this.closeOpenFormat(); + } + return this.resultItems; + } + + consume(item) { + /* newItem: copy of item with an added start symbol; written to the results right after item unless handleFreshUnclosed moves it into the cache. */ + var newItem; + + const handleFreshUnopened = () => { + item.annotation = REMOVED_ANNOTATION; + newItem = textItemWithOpening(item, item.unopenedFormat); + } + + const handleFreshLine = () => { + item.annotation = REMOVED_ANNOTATION; + newItem = textItemWithOpening(item, item.lineFormat); + this.cache(newItem, item.lineFormat); + } + + const handleFreshUnclosed = () => { + if (newItem) { + this.cache(newItem, item.unclosedFormat); + newItem = null; + } else { +
this.cache(item, item.unclosedFormat); + } + } + + //flush open format if possible + if (this.openFormat) { + if (item.unopenedFormat) { + if (item.unopenedFormat === this.openFormat) { + //good, closing an opened + this.clear(); + } else { + this.closeOpenFormat(); + handleFreshUnopened(); + } + } + + if (item.lineFormat) { + if (item.lineFormat === this.openFormat) { + this.cache(item, item.lineFormat); + } else { + this.closeOpenFormat(); + handleFreshLine(); + } + } + + if (item.unclosedFormat) { + this.closeOpenFormat(); + handleFreshUnclosed(); + } + + if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) { + this.closeOpenFormat(); + } + + } else { // handle fresh items + if (item.unopenedFormat) { + handleFreshUnopened(); + } + if (item.lineFormat) { + handleFreshLine(); + } + if (item.unclosedFormat) { + handleFreshUnclosed(); + } + } + + this.writeToResults(item); + if (newItem) { + this.writeToResults(newItem); + } + } +} + +function textItemWithOpening(textItem, format) { + return new TextItem({ + ...textItem, + text: format.startSymbol + textItem.text, + annotation: ADDED_ANNOTATION + }); +} + +function textItemWithClosing(textItem, format) { + return new TextItem({ + ...textItem, + text: textItem.text + format.endSymbol, + annotation: ADDED_ANNOTATION + }); +} diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx index 328e77c..6d3d20d 100644 --- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx +++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx @@ -3,6 +3,7 @@ import ParseResult from '../../ParseResult.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx'; +import { isListItem } from '../../../functions.jsx'; //Detect items starting with -, •, etc...
export default class DetectHeaders extends ToTextItemTransformation { @@ -56,7 +57,7 @@ export default class DetectHeaders extends ToTextItemTransformation { var lastHeight; parseResult.pages.forEach(page => { page.items.forEach(textItem => { - if (!textItem.type && textItem.height > mostUsedHeight) { + if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) { if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) { heights.push(textItem.height); } @@ -69,7 +70,7 @@ export default class DetectHeaders extends ToTextItemTransformation { const headlineType = headlineByLevel(2 + i); parseResult.pages.forEach(page => { page.items.forEach(textItem => { - if (!textItem.type && textItem.height == height) { + if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) { detectedHeaders++; textItem.annotation = DETECTED_ANNOTATION; textItem.type = headlineType;