diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index 14e7be4..e8b6b16 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -9,7 +9,6 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
-import CompleteFormats from './transformations/textitem/CompleteFormats.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
@@ -60,7 +59,6 @@ export default class AppState {
// new PostprocessLines(),
new DetectTOC(),
new DetectHeaders(),
- // new CompleteFormats(),
new DetectListItems(),
new GatherBlocks(),
diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx
index 6463a04..b29dddc 100644
--- a/src/javascript/models/ElementType.jsx
+++ b/src/javascript/models/ElementType.jsx
@@ -1,5 +1,5 @@
import { Enum } from 'enumify';
-import LineItem from './LineItem.jsx';
+import { linesToText } from './markdown/WordType.jsx';
import LineItemBlock from './LineItemBlock.jsx';
// An Markdown element
@@ -13,73 +13,73 @@ ElementType.initEnum({
headline: true,
headlineLevel: 1,
toText(block:LineItemBlock) {
- return '# ' + concatLineItems(block.items);
+ return '# ' + linesToText(block.items, true);
}
},
H2: {
headline: true,
headlineLevel: 2,
toText(block:LineItemBlock) {
- return '## ' + concatLineItems(block.items);
+ return '## ' + linesToText(block.items, true);
}
},
H3: {
headline: true,
headlineLevel: 3,
toText(block:LineItemBlock) {
- return '### ' + concatLineItems(block.items);
+ return '### ' + linesToText(block.items, true);
}
},
H4: {
headline: true,
headlineLevel: 4,
toText(block:LineItemBlock) {
- return '#### ' + concatLineItems(block.items);
+ return '#### ' + linesToText(block.items, true);
}
},
H5: {
headline: true,
headlineLevel: 5,
toText(block:LineItemBlock) {
- return '##### ' + concatLineItems(block.items);
+ return '##### ' + linesToText(block.items, true);
}
},
H6: {
headline: true,
headlineLevel: 6,
toText(block:LineItemBlock) {
- return '###### ' + concatLineItems(block.items);
+ return '###### ' + linesToText(block.items, true);
}
},
TOC: {
mergeToBlock: true,
toText(block:LineItemBlock) {
- return concatLineItems(block.items);
+ return linesToText(block.items, true);
}
},
FOOTNOTES: {
mergeToBlock: true,
mergeFollowingNonTypedItems: true,
toText(block:LineItemBlock) {
- return concatLineItems(block.items);
+ return linesToText(block.items, false);
}
},
CODE: {
mergeToBlock: true,
toText(block:LineItemBlock) {
- return '```\n' + concatLineItems(block.items) + '```'
+ return '```\n' + linesToText(block.items, true) + '```'
}
},
LIST: {
mergeToBlock: true,
mergeFollowingNonTypedItemsWithSmallDistance: true,
toText(block:LineItemBlock) {
- return concatLineItems(block.items);
+ return linesToText(block.items, false);
}
},
PARAGRAPH: {
toText(block:LineItemBlock) {
- return concatLineItems(block.items);
+ return linesToText(block.items, false);
}
}
});
@@ -90,19 +90,11 @@ export function isHeadline(elementType: ElementType) {
export function blockToText(block: LineItemBlock) {
if (!block.type) {
- return concatLineItems(block.items);
+ return linesToText(block.items, false);
}
return block.type.toText(block);
}
-function concatLineItems(lineItems: LineItem[]) {
- var text = '';
- lineItems.forEach(item => {
- text += item.text() + '\n';
- });
- return text;
-}
-
export function headlineByLevel(level) {
if (level == 1) {
return ElementType.H1;
diff --git a/src/javascript/models/LineConverter.jsx b/src/javascript/models/LineConverter.jsx
index 5ea47ee..66634a8 100644
--- a/src/javascript/models/LineConverter.jsx
+++ b/src/javascript/models/LineConverter.jsx
@@ -40,7 +40,8 @@ export default class LineConverter {
words: words,
parsedElements: new ParsedElements({
footnoteLinks: wordStream.footnoteLinks,
- footnotes: wordStream.footnotes
+ footnotes: wordStream.footnotes,
+ containLinks: wordStream.containLinks
})
});
@@ -48,34 +49,6 @@ export default class LineConverter {
}
-function itemsToWords(items, format) {
- const combinedText = combineText(items);
- // const combinedText = items.map(textItem => textItem.text).join('');
- const words = combinedText.split(' ');
- return words.filter(w => w.trim().length > 0).map(word => {
- return new Word({
- string: word,
- type: format
- });
- });
-}
-
-function combineText(textItems) {
- var text = '';
- var lastItem;
- textItems.forEach(textItem => {
- if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) {
- const xDistance = textItem.x - lastItem.x - lastItem.width;
- if (xDistance > 5) {
- text += ' ';
- }
- }
- text += textItem.text;
- lastItem = textItem;
- });
- return text;
-}
-
class WordDetectionStream extends StashingStream {
constructor(fontToFormats) {
@@ -83,6 +56,8 @@ class WordDetectionStream extends StashingStream {
this.fontToFormats = fontToFormats;
this.footnoteLinks = [];
this.footnotes = [];
+ this.formattedWords = 0
+ this.containLinks = false;
this.firstY;
this.stashedNumber = false;
@@ -113,21 +88,17 @@ class WordDetectionStream extends StashingStream {
doFlushStash(stash, results) {
if (this.stashedNumber) {
- const joinedNumber = stash.map(item => item.text).join('');
+ const joinedNumber = stash.map(item => item.text).join('').trim();
if (stash[0].y > this.firstY) { // footnote link
results.push(new Word({
string: `${joinedNumber}`,
type: WordType.FOOTNOTE_LINK
- //TODO format to
- //^
- //`[${joinedNumber}](#${joinedNumber})`
}));
this.footnoteLinks.push(parseInt(joinedNumber));
} else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote
results.push(new Word({
string: `${joinedNumber}`,
type: WordType.FOOTNOTE
- //TODO format to (^${ joinedNumber}):
}));
this.footnotes.push(joinedNumber);
} else {
@@ -140,6 +111,50 @@ class WordDetectionStream extends StashingStream {
copyStashItemsAsText(stash, results) {
const format = this.fontToFormats.get(stash[0].font);
- results.push(...itemsToWords(stash, format));
+ results.push(...this.itemsToWords(stash, format));
+ }
+
+    /**
+     * Splits the combined text of the given items into Word objects.
+     *
+     * Words starting with 'http:', 'https:' or 'www.' become WordType.LINK words
+     * ('www.' words get an 'http://' prefix) and set this.containLinks; every other
+     * word gets the passed format as its type. Whitespace-only fragments are dropped.
+     *
+     * @param items  the stashed text items to convert (joined via combineText)
+     * @param format the type assigned to all non-link words
+     * @returns the created Word objects in text order
+     */
+    itemsToWords(items, format) {
+        const words = combineText(items).split(' ');
+        return words.filter(w => w.trim().length > 0).map(word => {
+            const hasScheme = word.startsWith('http:') || word.startsWith('https:');
+            if (hasScheme || word.startsWith('www.')) {
+                this.containLinks = true;
+                return new Word({
+                    // only scheme-less 'www.' words are prefixed; words with a scheme stay as-is
+                    string: hasScheme ? word : `http://${word}`,
+                    type: WordType.LINK
+                });
+            }
+            return new Word({ string: word, type: format });
+        });
+    }
}
+
+
+// Concatenates the text of the given items in order. A space is inserted before an item when the
+// text so far has no trailing space, the item has no leading one, and x - prev.x - prev.width > 5.
+function combineText(textItems) {
+    let combined = '';
+    let previous;
+    for (const current of textItems) {
+        const unspaced = previous && !combined.endsWith(' ') && !current.text.startsWith(' ');
+        if (unspaced && current.x - previous.x - previous.width > 5) {
+            combined += ' ';
+        }
+        combined += current.text;
+        previous = current;
+    }
+    return combined;
+}
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
index 72f465d..fe9b61b 100644
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@@ -18,14 +18,12 @@ export class ParsedElements {
this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks;
- this.inlineFormats = options.inlineFormats || 0;
}
add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks;
- this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats;
}
}
\ No newline at end of file
diff --git a/src/javascript/models/StringFormat.jsx b/src/javascript/models/StringFormat.jsx
deleted file mode 100644
index 3518c2f..0000000
--- a/src/javascript/models/StringFormat.jsx
+++ /dev/null
@@ -1,25 +0,0 @@
-import { Enum } from 'enumify';
-
-export default class StringFormat extends Enum {
-}
-
-StringFormat.initEnum({
- STANDARD: {
- needFormat: false
- },
- BOLD: {
- needFormat: true,
- startSymbol: ' **',
- endSymbol: '** '
- },
- OBLIQUE: {
- needFormat: true,
- startSymbol: ' _',
- endSymbol: '_ '
- },
- BOLD_OBLIQUE: {
- needFormat: true,
- startSymbol: ' **_',
- endSymbol: '_** '
- }
-})
\ No newline at end of file
diff --git a/src/javascript/models/markdown/WordType.jsx b/src/javascript/models/markdown/WordType.jsx
index 64a531d..7964375 100644
--- a/src/javascript/models/markdown/WordType.jsx
+++ b/src/javascript/models/markdown/WordType.jsx
@@ -2,6 +2,96 @@ import { Enum } from 'enumify';
// An Markdown word element
export default class WordType extends Enum {
+
}
-WordType.initEnum(['LINK', 'FOOTNOTE_LINK', 'FOOTNOTE', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']);
\ No newline at end of file
+WordType.initEnum({ // per-type rendering data consumed by linesToText
+ LINK: {
+ toText(string) { // markdown link whose label and target are the same string
+ return `[${string}](${string})`
+ }
+ },
+ FOOTNOTE_LINK: {
+ attachWithoutWhitespace: true, // linesToText writes no separating space before a word of this type
+ plainTextFormat: true, // still rendered through toText when linesToText runs with disableInlineFormats
+ toText(string) {
+ return `^${string}`
+ // anchor-link variant, currently not used: `[${string}](#${string})`
+ }
+ },
+ FOOTNOTE: {
+ toText(string) { // the footnote number prefixed with '^' and wrapped in parentheses
+ return `(^${string})`
+ }
+ },
+ BOLD: {
+ format: true, // linesToText wraps a run of words of this type in startSymbol/endSymbol
+ startSymbol: '**',
+ endSymbol: '**',
+ },
+ OBLIQUE: {
+ format: true, // linesToText wraps a run of words of this type in startSymbol/endSymbol
+ startSymbol: '_',
+ endSymbol: '_',
+ },
+ BOLD_OBLIQUE: {
+ format: true, // linesToText wraps a run of words of this type in startSymbol/endSymbol
+ startSymbol: '**_',
+ endSymbol: '_**',
+ }
+});
+
+export function linesToText(lineItems, disableInlineFormats) { // renders the words of every line item into one string, each line terminated by '\n'
+ var text = '';
+ var openFormat; // format type whose startSymbol has been written while its endSymbol is still pending
+
+ const closeFormat = () => { // writes the pending endSymbol and clears the open format
+ text += openFormat.endSymbol;
+ openFormat = null;
+ };
+
+ lineItems.forEach((line, lineIndex) => {
+ line.words.forEach((word, i) => {
+ const wordType = word.type;
+ if (openFormat && (!wordType || wordType !== openFormat)) { // a word of another (or no) type ends the open format before any separator is written
+ closeFormat();
+ }
+
+ if (i > 0 && !(wordType && wordType.attachWithoutWhitespace) && !isPunctationCharacter(word.string)) { // space-separate words, except the line's first word, attachWithoutWhitespace types and a lone '.', '!' or '?'
+ text += ' ';
+ }
+ if (wordType && (!disableInlineFormats || wordType.plainTextFormat)) { // typed rendering is skipped when inline formats are disabled, unless the type is flagged plainTextFormat
+ if (wordType.format) {
+ if (!openFormat) { // write the startSymbol once; following words of the same type continue the run
+ openFormat = wordType;
+ text += openFormat.startSymbol;
+ }
+ text += word.string;
+ } else { // non-format types produce their own text
+ text += wordType.toText(word.string);
+ }
+ } else { // untyped word, or typed rendering disabled: emit the raw string
+ text += word.string;
+ }
+ });
+ if (openFormat && (lineIndex == lineItems.length - 1 || firstFormat(lineItems[lineIndex + 1]) !== openFormat)) { // the format stays open across the line break only when the next line starts with the same type
+ closeFormat();
+ }
+ text += '\n';
+ });
+ return text;
+}
+
+// Returns the type of the first word of the given line item,
+// or null when the line item contains no words at all.
+function firstFormat(lineItem) {
+    const { words } = lineItem;
+    return words.length === 0 ? null : words[0].type;
+}
+
+// True only for a one-character string that is one of
+// the marks '.', '!' or '?'; false for everything else.
+function isPunctationCharacter(string) {
+    const marks = ['.', '!', '?'];
+    return string.length === 1 && marks.indexOf(string) !== -1;
+}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
index 7a06d00..61a30ed 100644
--- a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
+++ b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
@@ -1,7 +1,6 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import WordType from '../../markdown/WordType.jsx';
-// import StringFormat from '../../StringFormat.jsx';
export default class CalculateGlobalStats extends ToTextItemTransformation {
diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx
index 12446bb..2c27aea 100644
--- a/src/javascript/models/transformations/textitem/CompactLines.jsx
+++ b/src/javascript/models/transformations/textitem/CompactLines.jsx
@@ -20,6 +20,7 @@ export default class CompactLines extends ToLineItemTransformation {
const {mostUsedDistance, fontToFormats} = parseResult.globals;
const foundFootnotes = [];
const foundFootnoteLinks = [];
+ var linkCount = 0;
var formattedWords = 0;
const lineGrouper = new TextItemLineGrouper({
@@ -47,6 +48,9 @@ export default class CompactLines extends ToLineItemTransformation {
}
lineItems.push(lineItem);
+ if (lineItem.parsedElements.containLinks > 0) {
+ linkCount++;
+ }
if (lineItem.parsedElements.footnoteLinks.length > 0) {
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },);
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
@@ -66,6 +70,7 @@ export default class CompactLines extends ToLineItemTransformation {
...parseResult,
messages: [
'Detected ' + formattedWords + ' formatted words',
+ 'Found ' + linkCount + ' links',
Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }],
Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }],
]
diff --git a/src/javascript/models/transformations/textitem/CompleteFormats.jsx b/src/javascript/models/transformations/textitem/CompleteFormats.jsx
deleted file mode 100644
index e6ac7f8..0000000
--- a/src/javascript/models/transformations/textitem/CompleteFormats.jsx
+++ /dev/null
@@ -1,171 +0,0 @@
-import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
-import ParseResult from '../../ParseResult.jsx';
-import TextItem from '../../TextItem.jsx';
-import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx';
-
-//Complete unopened/unclosed bold/italic formats
-export default class CompleteFormats extends ToTextItemTransformation {
-
- //TODO move to block and ignore quotes
-
- constructor() {
- super("Complete Bold/Italics");
- }
-
- transform(parseResult:ParseResult) {
- // remove line formats from headers
- parseResult.pages.forEach(page => {
- page.items.forEach(item => {
- if (item.type && item.type.headline) {
- if (item.lineFormat || item.unopenedFormat || item.unclosedFormat) {
- item.lineFormat = null;
- item.unopenedFormat = null;
- item.unclosedFormat = null;
- item.annotation = UNCHANGED_ANNOTATION;
- }
- }
- });
- });
-
- //close open formats
- parseResult.pages.forEach(page => {
- const itemStack = new ItemStack();
- page.items.forEach(item => {
- itemStack.consume(item);
- });
- page.items = itemStack.getResults();
- });
- return new ParseResult({
- ...parseResult,
- messages: []
- });
-
- }
-
-}
-
-class ItemStack {
-
- constructor() {
- this.openFormat;
- this.openFormatItem = [];
- this.resultItems = [];
- }
-
- cache(textItem, format) {
- this.openFormat = format;
- this.openFormatItem = textItem;
- }
-
- closeOpenFormat() {
- if (this.openFormat) {
- this.openFormatItem.annotation = REMOVED_ANNOTATION;
- this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat));
- this.clear();
- }
- }
-
- clear() {
- this.openFormat = null;
- this.openFormatItem = null;
- }
-
- writeToResults(textItem) {
- this.resultItems.push(textItem);
- }
-
-
- getResults() {
- if (this.openFormat) {
- this.closeOpenFormat();
- }
- return this.resultItems;
- }
-
- consume(item) {
- var newItem;
-
- const handleFreshUnopened = () => {
- item.annotation = REMOVED_ANNOTATION;
- newItem = textItemWithOpening(item, item.unopenedFormat);
- }
-
- const handleFreshLine = () => {
- item.annotation = REMOVED_ANNOTATION;
- newItem = textItemWithOpening(item, item.lineFormat);
- this.cache(newItem, item.lineFormat);
- }
-
- const handleFreshUnclosed = () => {
- if (newItem) {
- this.cache(newItem, item.unclosedFormat);
- newItem = null;
- } else {
- this.cache(item, item.unclosedFormat);
- }
- }
-
- //flush open format if possible
- if (this.openFormat) {
- if (item.unopenedFormat) {
- if (item.unopenedFormat === this.openFormat) {
- //good, closing an opened
- this.clear();
- } else {
- this.closeOpenFormat();
- handleFreshUnopened();
- }
- }
-
- if (item.lineFormat) {
- if (item.lineFormat === this.openFormat) {
- this.cache(item, item.lineFormat);
- } else {
- this.closeOpenFormat();
- handleFreshLine();
- }
- }
-
- if (item.unclosedFormat) {
- this.closeOpenFormat();
- handleFreshUnclosed();
- }
-
- if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) {
- this.closeOpenFormat();
- }
-
- } else { // handle fresh items
- if (item.unopenedFormat) {
- handleFreshUnopened()
- }
- if (item.lineFormat) {
- handleFreshLine();
- }
- if (item.unclosedFormat) {
- handleFreshUnclosed();
- }
- }
-
- this.writeToResults(item);
- if (newItem) {
- this.writeToResults(newItem);
- }
- }
-}
-
-function textItemWithOpening(textItem, format) {
- return new TextItem({
- ...textItem,
- text: format.startSymbol + textItem.text,
- annotation: ADDED_ANNOTATION
- });
-}
-
-function textItemWithClosing(textItem, format) {
- return new TextItem({
- ...textItem,
- text: textItem.text + format.endSymbol,
- annotation: ADDED_ANNOTATION
- });
-}