From 9dbc57b4fe2478bed1b0457ca7f8f07f0663039b Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Tue, 28 Mar 2017 06:11:42 +0200 Subject: [PATCH] [DONE] format words properly --- src/javascript/models/AppState.jsx | 2 - src/javascript/models/ElementType.jsx | 34 ++-- src/javascript/models/LineConverter.jsx | 85 +++++---- src/javascript/models/PageItem.jsx | 2 - src/javascript/models/StringFormat.jsx | 25 --- src/javascript/models/markdown/WordType.jsx | 92 +++++++++- .../textitem/CalculateGlobalStats.jsx | 1 - .../transformations/textitem/CompactLines.jsx | 5 + .../textitem/CompleteFormats.jsx | 171 ------------------ 9 files changed, 159 insertions(+), 258 deletions(-) delete mode 100644 src/javascript/models/StringFormat.jsx delete mode 100644 src/javascript/models/transformations/textitem/CompleteFormats.jsx diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 14e7be4..e8b6b16 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -9,7 +9,6 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx'; import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' -import CompleteFormats from './transformations/textitem/CompleteFormats.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' @@ -60,7 +59,6 @@ export default class AppState { // new PostprocessLines(), new DetectTOC(), new DetectHeaders(), - // new CompleteFormats(), new DetectListItems(), new GatherBlocks(), diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx index 6463a04..b29dddc 100644 --- a/src/javascript/models/ElementType.jsx +++ b/src/javascript/models/ElementType.jsx @@ -1,5 +1,5 @@ import { Enum } from 'enumify'; -import LineItem from './LineItem.jsx'; +import { linesToText } from './markdown/WordType.jsx'; import LineItemBlock from './LineItemBlock.jsx'; // An Markdown element @@ -13,73 +13,73 @@ ElementType.initEnum({ headline: true, headlineLevel: 1, toText(block:LineItemBlock) { - return '# ' + concatLineItems(block.items); + return '# ' + linesToText(block.items, true); } }, H2: { headline: true, headlineLevel: 2, toText(block:LineItemBlock) { - return '## ' + concatLineItems(block.items); + return '## ' + linesToText(block.items, true); } }, H3: { headline: true, headlineLevel: 3, toText(block:LineItemBlock) { - return '### ' + concatLineItems(block.items); + return '### ' + linesToText(block.items, true); } }, H4: { headline: true, headlineLevel: 4, toText(block:LineItemBlock) { - return '#### ' + concatLineItems(block.items); + return '#### ' + linesToText(block.items, true); } }, H5: { headline: true, headlineLevel: 5, toText(block:LineItemBlock) { - return '##### ' + concatLineItems(block.items); + return '##### ' + linesToText(block.items, true); } }, H6: { headline: true, headlineLevel: 6, toText(block:LineItemBlock) { - return '###### ' + concatLineItems(block.items); + return '###### ' + linesToText(block.items, true); } }, TOC: { mergeToBlock: true, toText(block:LineItemBlock) { - return concatLineItems(block.items); + return linesToText(block.items, true); } }, FOOTNOTES: { mergeToBlock: true, mergeFollowingNonTypedItems: true, toText(block:LineItemBlock) { - return concatLineItems(block.items); + return linesToText(block.items, false); } }, CODE: { mergeToBlock: true, toText(block:LineItemBlock) { - return '```\n' + concatLineItems(block.items) + '```' + return '```\n' + linesToText(block.items, true) + '```' } }, LIST: { mergeToBlock: true, mergeFollowingNonTypedItemsWithSmallDistance: true, toText(block:LineItemBlock) { - return concatLineItems(block.items); + return linesToText(block.items, false); } }, PARAGRAPH: { toText(block:LineItemBlock) { - return concatLineItems(block.items); + return linesToText(block.items, false); } } }); @@ -90,19 +90,11 @@ export function isHeadline(elementType: ElementType) { export function blockToText(block: LineItemBlock) { if (!block.type) { - return concatLineItems(block.items); + return linesToText(block.items, false); } return block.type.toText(block); } -function concatLineItems(lineItems: LineItem[]) { - var text = ''; - lineItems.forEach(item => { - text += item.text() + '\n'; - }); - return text; -} - export function headlineByLevel(level) { if (level == 1) { return ElementType.H1; diff --git a/src/javascript/models/LineConverter.jsx b/src/javascript/models/LineConverter.jsx index 5ea47ee..66634a8 100644 --- a/src/javascript/models/LineConverter.jsx +++ b/src/javascript/models/LineConverter.jsx @@ -40,7 +40,8 @@ export default class LineConverter { words: words, parsedElements: new ParsedElements({ footnoteLinks: wordStream.footnoteLinks, - footnotes: wordStream.footnotes + footnotes: wordStream.footnotes, + containLinks: wordStream.containLinks }) }); @@ -48,34 +49,6 @@ export default class LineConverter { } -function itemsToWords(items, format) { - const combinedText = combineText(items); - // const combinedText = items.map(textItem => textItem.text).join(''); - const words = combinedText.split(' '); - return words.filter(w => w.trim().length > 0).map(word => { - return new Word({ - string: word, - type: format - }); - }); -} - -function combineText(textItems) { - var text = ''; - var lastItem; - textItems.forEach(textItem => { - if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) { - const xDistance = textItem.x - lastItem.x - lastItem.width; - if (xDistance > 5) { - text += ' '; - } - } - text += textItem.text; - lastItem = textItem; - }); - return text; -} - class WordDetectionStream extends StashingStream { constructor(fontToFormats) { @@ -83,6 +56,8 @@ class WordDetectionStream extends StashingStream { this.fontToFormats = fontToFormats; this.footnoteLinks = []; this.footnotes = []; + this.formattedWords = 0 + this.containLinks = false; this.firstY; this.stashedNumber = false; @@ -113,21 +88,17 @@ class WordDetectionStream extends StashingStream { doFlushStash(stash, results) { if (this.stashedNumber) { - const joinedNumber = stash.map(item => item.text).join(''); + const joinedNumber = stash.map(item => item.text).join('').trim(); if (stash[0].y > this.firstY) { // footnote link results.push(new Word({ string: `${joinedNumber}`, type: WordType.FOOTNOTE_LINK - //TODO format to - //^ - //`[${joinedNumber}](#${joinedNumber})` })); this.footnoteLinks.push(parseInt(joinedNumber)); } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote results.push(new Word({ string: `${joinedNumber}`, type: WordType.FOOTNOTE - //TODO format to (^${ joinedNumber}): })); this.footnotes.push(joinedNumber); } else { @@ -140,6 +111,50 @@ class WordDetectionStream extends StashingStream { copyStashItemsAsText(stash, results) { const format = this.fontToFormats.get(stash[0].font); - results.push(...itemsToWords(stash, format)); + results.push(...this.itemsToWords(stash, format)); + } + + itemsToWords(items, format) { + const combinedText = combineText(items); + // const combinedText = items.map(textItem => textItem.text).join(''); + const words = combinedText.split(' '); + return words.filter(w => w.trim().length > 0).map(word => { + if (word.startsWith('http:')) { + this.containLinks = true; + return new Word({ + string: word, + type: WordType.LINK + }); + } else if (word.startsWith('www.')) { + this.containLinks = true; + word = `http://${word}` + return new Word({ + string: word, + type: WordType.LINK + }); + } + + return new Word({ + string: word, + type: format + }); + }); } } + + +function combineText(textItems) { + var text = ''; + var lastItem; + textItems.forEach(textItem => { + if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) { + const xDistance = textItem.x - lastItem.x - lastItem.width; + if (xDistance > 5) { + text += ' '; + } + } + text += textItem.text; + lastItem = textItem; + }); + return text; +} diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx index 72f465d..fe9b61b 100644 --- a/src/javascript/models/PageItem.jsx +++ b/src/javascript/models/PageItem.jsx @@ -18,14 +18,12 @@ export class ParsedElements { this.footnoteLinks = options.footnoteLinks || []; this.footnotes = options.footnotes || []; this.containLinks = options.containLinks; - this.inlineFormats = options.inlineFormats || 0; } add(parsedElements) { this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.containLinks = this.containLinks || parsedElements.containLinks; - this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats; } } \ No newline at end of file diff --git a/src/javascript/models/StringFormat.jsx b/src/javascript/models/StringFormat.jsx deleted file mode 100644 index 3518c2f..0000000 --- a/src/javascript/models/StringFormat.jsx +++ /dev/null @@ -1,25 +0,0 @@ -import { Enum } from 'enumify'; - -export default class StringFormat extends Enum { -} - -StringFormat.initEnum({ - STANDARD: { - needFormat: false - }, - BOLD: { - needFormat: true, - startSymbol: ' **', - endSymbol: '** ' - }, - OBLIQUE: { - needFormat: true, - startSymbol: ' _', - endSymbol: '_ ' - }, - BOLD_OBLIQUE: { - needFormat: true, - startSymbol: ' **_', - endSymbol: '_** ' - } -}) \ No newline at end of file diff --git a/src/javascript/models/markdown/WordType.jsx b/src/javascript/models/markdown/WordType.jsx index 64a531d..7964375 100644 --- a/src/javascript/models/markdown/WordType.jsx +++ b/src/javascript/models/markdown/WordType.jsx @@ -2,6 +2,96 @@ import { Enum } from 'enumify'; // An Markdown word element export default class WordType extends Enum { + } -WordType.initEnum(['LINK', 'FOOTNOTE_LINK', 'FOOTNOTE', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']); \ No newline at end of file +WordType.initEnum({ + LINK: { + toText(string) { + return `[${string}](${string})` + } + }, + FOOTNOTE_LINK: { + attachWithoutWhitespace: true, + plainTextFormat: true, + toText(string) { + return `^${string}` + // return `[${string}](#${string})`; + } + }, + FOOTNOTE: { + toText(string) { + return `(^${string})` + } + }, + BOLD: { + format: true, + startSymbol: '**', + endSymbol: '**', + }, + OBLIQUE: { + format: true, + startSymbol: '_', + endSymbol: '_', + }, + BOLD_OBLIQUE: { + format: true, + startSymbol: '**_', + endSymbol: '_**', + } +}); + +export function linesToText(lineItems, disableInlineFormats) { + var text = ''; + var openFormat; + + const closeFormat = () => { + text += openFormat.endSymbol; + openFormat = null; + }; + + lineItems.forEach((line, lineIndex) => { + line.words.forEach((word, i) => { + const wordType = word.type; + if (openFormat && (!wordType || wordType !== openFormat)) { + closeFormat(); + } + + if (i > 0 && !(wordType && wordType.attachWithoutWhitespace) && !isPunctationCharacter(word.string)) { + text += ' '; + } + if (wordType && (!disableInlineFormats || wordType.plainTextFormat)) { + if (wordType.format) { + if (!openFormat) { + openFormat = wordType; + text += openFormat.startSymbol; + } + text += word.string; + } else { + text += wordType.toText(word.string); + } + } else { + text += word.string; + } + }); + if (openFormat && (lineIndex == lineItems.length - 1 || firstFormat(lineItems[lineIndex + 1]) !== openFormat)) { + closeFormat(); + } + text += '\n'; + }); + return text; +} + +function firstFormat(lineItem) { + if (lineItem.words.length == 0) { + return null; + } + return lineItem.words[0].type; +} + +function isPunctationCharacter(string) { + if (string.length != 1) { + return false; + } + return string[0] === '.' || string[0] === '!' || string[0] === '?'; +} \ No newline at end of file diff --git a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx index 7a06d00..61a30ed 100644 --- a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx +++ b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx @@ -1,7 +1,6 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; import WordType from '../../markdown/WordType.jsx'; -// import StringFormat from '../../StringFormat.jsx'; export default class CalculateGlobalStats extends ToTextItemTransformation { diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx index 12446bb..2c27aea 100644 --- a/src/javascript/models/transformations/textitem/CompactLines.jsx +++ b/src/javascript/models/transformations/textitem/CompactLines.jsx @@ -20,6 +20,7 @@ export default class CompactLines extends ToLineItemTransformation { const {mostUsedDistance, fontToFormats} = parseResult.globals; const foundFootnotes = []; const foundFootnoteLinks = []; + var linkCount = 0; var formattedWords = 0; const lineGrouper = new TextItemLineGrouper({ @@ -47,6 +48,9 @@ export default class CompactLines extends ToLineItemTransformation { } lineItems.push(lineItem); + if (lineItem.parsedElements.containLinks > 0) { + linkCount++; + } if (lineItem.parsedElements.footnoteLinks.length > 0) { const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },); foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); @@ -66,6 +70,7 @@ export default class CompactLines extends ToLineItemTransformation { ...parseResult, messages: [ 'Detected ' + formattedWords + ' formatted words', + 'Found ' + linkCount + ' links', Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }], Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }], ] diff --git a/src/javascript/models/transformations/textitem/CompleteFormats.jsx b/src/javascript/models/transformations/textitem/CompleteFormats.jsx deleted file mode 100644 index e6ac7f8..0000000 --- a/src/javascript/models/transformations/textitem/CompleteFormats.jsx +++ /dev/null @@ -1,171 +0,0 @@ -import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; -import ParseResult from '../../ParseResult.jsx'; -import TextItem from '../../TextItem.jsx'; -import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx'; - -//Complete unopened/unclosed bold/italic formats -export default class CompleteFormats extends ToTextItemTransformation { - - //TODO move to block and ignore quotes - - constructor() { - super("Complete Bold/Italics"); - } - - transform(parseResult:ParseResult) { - // remove line formats from headers - parseResult.pages.forEach(page => { - page.items.forEach(item => { - if (item.type && item.type.headline) { - if (item.lineFormat || item.unopenedFormat || item.unclosedFormat) { - item.lineFormat = null; - item.unopenedFormat = null; - item.unclosedFormat = null; - item.annotation = UNCHANGED_ANNOTATION; - } - } - }); - }); - - //close open formats - parseResult.pages.forEach(page => { - const itemStack = new ItemStack(); - page.items.forEach(item => { - itemStack.consume(item); - }); - page.items = itemStack.getResults(); - }); - return new ParseResult({ - ...parseResult, - messages: [] - }); - - } - -} - -class ItemStack { - - constructor() { - this.openFormat; - this.openFormatItem = []; - this.resultItems = []; - } - - cache(textItem, format) { - this.openFormat = format; - this.openFormatItem = textItem; - } - - closeOpenFormat() { - if (this.openFormat) { - this.openFormatItem.annotation = REMOVED_ANNOTATION; - this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat)); - this.clear(); - } - } - - clear() { - this.openFormat = null; - this.openFormatItem = null; - } - - writeToResults(textItem) { - this.resultItems.push(textItem); - } - - - getResults() { - if (this.openFormat) { - this.closeOpenFormat(); - } - return this.resultItems; - } - - consume(item) { - var newItem; - - const handleFreshUnopened = () => { - item.annotation = REMOVED_ANNOTATION; - newItem = textItemWithOpening(item, item.unopenedFormat); - } - - const handleFreshLine = () => { - item.annotation = REMOVED_ANNOTATION; - newItem = textItemWithOpening(item, item.lineFormat); - this.cache(newItem, item.lineFormat); - } - - const handleFreshUnclosed = () => { - if (newItem) { - this.cache(newItem, item.unclosedFormat); - newItem = null; - } else { - this.cache(item, item.unclosedFormat); - } - } - - //flush open format if possible - if (this.openFormat) { - if (item.unopenedFormat) { - if (item.unopenedFormat === this.openFormat) { - //good, closing an opened - this.clear(); - } else { - this.closeOpenFormat(); - handleFreshUnopened(); - } - } - - if (item.lineFormat) { - if (item.lineFormat === this.openFormat) { - this.cache(item, item.lineFormat); - } else { - this.closeOpenFormat(); - handleFreshLine(); - } - } - - if (item.unclosedFormat) { - this.closeOpenFormat(); - handleFreshUnclosed(); - } - - if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) { - this.closeOpenFormat(); - } - - } else { // handle fresh items - if (item.unopenedFormat) { - handleFreshUnopened() - } - if (item.lineFormat) { - handleFreshLine(); - } - if (item.unclosedFormat) { - handleFreshUnclosed(); - } - } - - this.writeToResults(item); - if (newItem) { - this.writeToResults(newItem); - } - } -} - -function textItemWithOpening(textItem, format) { - return new TextItem({ - ...textItem, - text: format.startSymbol + textItem.text, - annotation: ADDED_ANNOTATION - }); -} - -function textItemWithClosing(textItem, format) { - return new TextItem({ - ...textItem, - text: textItem.text + format.endSymbol, - annotation: ADDED_ANNOTATION - }); -}