[DONE] format words properly

This commit is contained in:
Johannes Zillmann 2017-03-28 06:11:42 +02:00
parent 09facb09b4
commit 9dbc57b4fe
9 changed files with 159 additions and 258 deletions

View File

@ -9,7 +9,6 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import CompleteFormats from './transformations/textitem/CompleteFormats.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
@ -60,7 +59,6 @@ export default class AppState {
// new PostprocessLines(), // new PostprocessLines(),
new DetectTOC(), new DetectTOC(),
new DetectHeaders(), new DetectHeaders(),
// new CompleteFormats(),
new DetectListItems(), new DetectListItems(),
new GatherBlocks(), new GatherBlocks(),

View File

@ -1,5 +1,5 @@
import { Enum } from 'enumify'; import { Enum } from 'enumify';
import LineItem from './LineItem.jsx'; import { linesToText } from './markdown/WordType.jsx';
import LineItemBlock from './LineItemBlock.jsx'; import LineItemBlock from './LineItemBlock.jsx';
// A Markdown element // A Markdown element
@ -13,73 +13,73 @@ ElementType.initEnum({
headline: true, headline: true,
headlineLevel: 1, headlineLevel: 1,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return '# ' + concatLineItems(block.items); return '# ' + linesToText(block.items, true);
} }
}, },
H2: { H2: {
headline: true, headline: true,
headlineLevel: 2, headlineLevel: 2,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return '## ' + concatLineItems(block.items); return '## ' + linesToText(block.items, true);
} }
}, },
H3: { H3: {
headline: true, headline: true,
headlineLevel: 3, headlineLevel: 3,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return '### ' + concatLineItems(block.items); return '### ' + linesToText(block.items, true);
} }
}, },
H4: { H4: {
headline: true, headline: true,
headlineLevel: 4, headlineLevel: 4,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return '#### ' + concatLineItems(block.items); return '#### ' + linesToText(block.items, true);
} }
}, },
H5: { H5: {
headline: true, headline: true,
headlineLevel: 5, headlineLevel: 5,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return '##### ' + concatLineItems(block.items); return '##### ' + linesToText(block.items, true);
} }
}, },
H6: { H6: {
headline: true, headline: true,
headlineLevel: 6, headlineLevel: 6,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return '###### ' + concatLineItems(block.items); return '###### ' + linesToText(block.items, true);
} }
}, },
TOC: { TOC: {
mergeToBlock: true, mergeToBlock: true,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return concatLineItems(block.items); return linesToText(block.items, true);
} }
}, },
FOOTNOTES: { FOOTNOTES: {
mergeToBlock: true, mergeToBlock: true,
mergeFollowingNonTypedItems: true, mergeFollowingNonTypedItems: true,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return concatLineItems(block.items); return linesToText(block.items, false);
} }
}, },
CODE: { CODE: {
mergeToBlock: true, mergeToBlock: true,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return '```\n' + concatLineItems(block.items) + '```' return '```\n' + linesToText(block.items, true) + '```'
} }
}, },
LIST: { LIST: {
mergeToBlock: true, mergeToBlock: true,
mergeFollowingNonTypedItemsWithSmallDistance: true, mergeFollowingNonTypedItemsWithSmallDistance: true,
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return concatLineItems(block.items); return linesToText(block.items, false);
} }
}, },
PARAGRAPH: { PARAGRAPH: {
toText(block:LineItemBlock) { toText(block:LineItemBlock) {
return concatLineItems(block.items); return linesToText(block.items, false);
} }
} }
}); });
@ -90,19 +90,11 @@ export function isHeadline(elementType: ElementType) {
export function blockToText(block: LineItemBlock) { export function blockToText(block: LineItemBlock) {
if (!block.type) { if (!block.type) {
return concatLineItems(block.items); return linesToText(block.items, false);
} }
return block.type.toText(block); return block.type.toText(block);
} }
function concatLineItems(lineItems: LineItem[]) {
var text = '';
lineItems.forEach(item => {
text += item.text() + '\n';
});
return text;
}
export function headlineByLevel(level) { export function headlineByLevel(level) {
if (level == 1) { if (level == 1) {
return ElementType.H1; return ElementType.H1;

View File

@ -40,7 +40,8 @@ export default class LineConverter {
words: words, words: words,
parsedElements: new ParsedElements({ parsedElements: new ParsedElements({
footnoteLinks: wordStream.footnoteLinks, footnoteLinks: wordStream.footnoteLinks,
footnotes: wordStream.footnotes footnotes: wordStream.footnotes,
containLinks: wordStream.containLinks
}) })
}); });
@ -48,34 +49,6 @@ export default class LineConverter {
} }
// Joins the given text items into one string, splits it on single spaces and
// wraps every non-blank token in a Word carrying the supplied format as type.
function itemsToWords(items, format) {
    // const combinedText = items.map(textItem => textItem.text).join('');
    const tokens = combineText(items).split(' ');
    const words = [];
    for (const token of tokens) {
        if (token.trim().length > 0) {
            words.push(new Word({
                string: token,
                type: format
            }));
        }
    }
    return words;
}
// Concatenates the text of all items. A single space is inserted between two
// neighbours when neither side already supplies one and the horizontal gap
// between them (next.x - previous.x - previous.width) is larger than 5.
function combineText(textItems) {
    let previous;
    return textItems.reduce((combined, current) => {
        let separator = '';
        if (previous && !(combined.endsWith(' ') || current.text.startsWith(' '))) {
            const gap = current.x - previous.x - previous.width;
            if (gap > 5) {
                separator = ' ';
            }
        }
        previous = current;
        return combined + separator + current.text;
    }, '');
}
class WordDetectionStream extends StashingStream { class WordDetectionStream extends StashingStream {
constructor(fontToFormats) { constructor(fontToFormats) {
@ -83,6 +56,8 @@ class WordDetectionStream extends StashingStream {
this.fontToFormats = fontToFormats; this.fontToFormats = fontToFormats;
this.footnoteLinks = []; this.footnoteLinks = [];
this.footnotes = []; this.footnotes = [];
this.formattedWords = 0
this.containLinks = false;
this.firstY; this.firstY;
this.stashedNumber = false; this.stashedNumber = false;
@ -113,21 +88,17 @@ class WordDetectionStream extends StashingStream {
doFlushStash(stash, results) { doFlushStash(stash, results) {
if (this.stashedNumber) { if (this.stashedNumber) {
const joinedNumber = stash.map(item => item.text).join(''); const joinedNumber = stash.map(item => item.text).join('').trim();
if (stash[0].y > this.firstY) { // footnote link if (stash[0].y > this.firstY) { // footnote link
results.push(new Word({ results.push(new Word({
string: `${joinedNumber}`, string: `${joinedNumber}`,
type: WordType.FOOTNOTE_LINK type: WordType.FOOTNOTE_LINK
//TODO format to
//^
//`<sup>[${joinedNumber}](#${joinedNumber})</sup>`
})); }));
this.footnoteLinks.push(parseInt(joinedNumber)); this.footnoteLinks.push(parseInt(joinedNumber));
} else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote
results.push(new Word({ results.push(new Word({
string: `${joinedNumber}`, string: `${joinedNumber}`,
type: WordType.FOOTNOTE type: WordType.FOOTNOTE
//TODO format to (^${ joinedNumber}):
})); }));
this.footnotes.push(joinedNumber); this.footnotes.push(joinedNumber);
} else { } else {
@ -140,6 +111,50 @@ class WordDetectionStream extends StashingStream {
copyStashItemsAsText(stash, results) { copyStashItemsAsText(stash, results) {
const format = this.fontToFormats.get(stash[0].font); const format = this.fontToFormats.get(stash[0].font);
results.push(...itemsToWords(stash, format)); results.push(...this.itemsToWords(stash, format));
}
itemsToWords(items, format) {
const combinedText = combineText(items);
// const combinedText = items.map(textItem => textItem.text).join('');
const words = combinedText.split(' ');
return words.filter(w => w.trim().length > 0).map(word => {
if (word.startsWith('http:')) {
this.containLinks = true;
return new Word({
string: word,
type: WordType.LINK
});
} else if (word.startsWith('www.')) {
this.containLinks = true;
word = `http://${word}`
return new Word({
string: word,
type: WordType.LINK
});
}
return new Word({
string: word,
type: format
});
});
} }
} }
// Builds one string out of the items' text. Between two consecutive items a
// space is added only if the text so far does not end with a space, the next
// text does not start with one, and the x-gap between them exceeds 5.
function combineText(textItems) {
    let previousItem;
    return textItems.reduce((joined, item) => {
        let spacer = '';
        if (previousItem && !(joined.endsWith(' ') || item.text.startsWith(' '))) {
            const distance = item.x - previousItem.x - previousItem.width;
            if (distance > 5) {
                spacer = ' ';
            }
        }
        previousItem = item;
        return joined + spacer + item.text;
    }, '');
}

View File

@ -18,14 +18,12 @@ export class ParsedElements {
this.footnoteLinks = options.footnoteLinks || []; this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes || []; this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks; this.containLinks = options.containLinks;
this.inlineFormats = options.inlineFormats || 0;
} }
add(parsedElements) { add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks; this.containLinks = this.containLinks || parsedElements.containLinks;
this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats;
} }
} }

View File

@ -1,25 +0,0 @@
import { Enum } from 'enumify';
// Enum of inline text formats, each carrying the markdown symbols used to
// open and close it.
export default class StringFormat extends Enum {
}
// Each constant carries:
//   needFormat  - whether the format produces any markdown symbols at all
//   startSymbol - text placed in front of a formatted run
//   endSymbol   - text placed behind a formatted run
// Note that the symbols here include a leading/trailing space.
StringFormat.initEnum({
// plain text, no symbols
STANDARD: {
needFormat: false
},
BOLD: {
needFormat: true,
startSymbol: ' **',
endSymbol: '** '
},
// oblique (italic) text
OBLIQUE: {
needFormat: true,
startSymbol: ' _',
endSymbol: '_ '
},
BOLD_OBLIQUE: {
needFormat: true,
startSymbol: ' **_',
endSymbol: '_** '
}
})

View File

@ -2,6 +2,96 @@ import { Enum } from 'enumify';
// A Markdown word element // A Markdown word element
export default class WordType extends Enum { export default class WordType extends Enum {
} }
// NOTE(review): the next line shows both sides of the diff at once - the
// removed array-form initEnum call followed by the start of the new
// object-form call. Only the object-form call continues below.
//
// Properties used on the constants:
//   toText(string)          - renders a single word of this type as markdown
//   format                  - marks a type rendered via startSymbol/endSymbol
//                             around a run of words instead of per word
//   attachWithoutWhitespace - linesToText adds no space in front of such a word
//   plainTextFormat         - linesToText renders the type even when inline
//                             formats are disabled
WordType.initEnum(['LINK', 'FOOTNOTE_LINK', 'FOOTNOTE', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']); WordType.initEnum({
// rendered as a markdown link whose label and target are both the word
LINK: {
toText(string) {
return `[${string}](${string})`
}
},
// reference to a footnote inside running text, rendered as ^<number>
FOOTNOTE_LINK: {
attachWithoutWhitespace: true,
plainTextFormat: true,
toText(string) {
return `^${string}`
// return `<sup>[${string}](#${string})</sup>`;
}
},
// the footnote marker itself, rendered as (^<number>)
FOOTNOTE: {
toText(string) {
return `(^${string})`
}
},
BOLD: {
format: true,
startSymbol: '**',
endSymbol: '**',
},
// oblique (italic) text
OBLIQUE: {
format: true,
startSymbol: '_',
endSymbol: '_',
},
BOLD_OBLIQUE: {
format: true,
startSymbol: '**_',
endSymbol: '_**',
}
});
/**
 * Renders line items (each exposing a `words` array of {string, type}) as
 * markdown text, one output line per line item, each terminated by '\n'.
 *
 * Words are separated by a single space, except for the first word of a line,
 * words whose type has `attachWithoutWhitespace`, and single-character
 * punctuation words. Types flagged `format` are wrapped with their
 * startSymbol/endSymbol around a whole run of equally typed words; the run may
 * continue on the next line when that line starts with the same type. All
 * other types are rendered through their `toText`.
 *
 * When `disableInlineFormats` is truthy, only types with `plainTextFormat`
 * are rendered specially; every other word is emitted as its raw string.
 */
export function linesToText(lineItems, disableInlineFormats) {
    let output = '';
    let activeFormat;

    // Writes the end symbol of the currently open format and forgets it.
    function endActiveFormat() {
        output += activeFormat.endSymbol;
        activeFormat = null;
    }

    for (let lineNo = 0; lineNo < lineItems.length; lineNo++) {
        const words = lineItems[lineNo].words;

        for (let wordNo = 0; wordNo < words.length; wordNo++) {
            const current = words[wordNo];
            const type = current.type;

            // a differently typed (or untyped) word terminates the open run
            if (activeFormat && type !== activeFormat) {
                endActiveFormat();
            }

            const attaches = type && type.attachWithoutWhitespace;
            if (wordNo > 0 && !attaches && !isPunctationCharacter(current.string)) {
                output += ' ';
            }

            const renderType = type && (!disableInlineFormats || type.plainTextFormat);
            if (!renderType) {
                output += current.string;
                continue;
            }
            if (!type.format) {
                output += type.toText(current.string);
                continue;
            }
            if (!activeFormat) {
                activeFormat = type;
                output += activeFormat.startSymbol;
            }
            output += current.string;
        }

        // keep the run open only if the following line starts with the same format
        const isLastLine = lineNo == lineItems.length - 1;
        if (activeFormat && (isLastLine || firstFormat(lineItems[lineNo + 1]) !== activeFormat)) {
            endActiveFormat();
        }
        output += '\n';
    }
    return output;
}
// Returns the type of the line's first word, or null when the line has no words.
function firstFormat(lineItem) {
    return lineItem.words.length === 0 ? null : lineItem.words[0].type;
}
// True only for a one-character string that is '.', '!' or '?'.
function isPunctationCharacter(string) {
    return string.length === 1 && ['.', '!', '?'].includes(string[0]);
}

View File

@ -1,7 +1,6 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import WordType from '../../markdown/WordType.jsx'; import WordType from '../../markdown/WordType.jsx';
// import StringFormat from '../../StringFormat.jsx';
export default class CalculateGlobalStats extends ToTextItemTransformation { export default class CalculateGlobalStats extends ToTextItemTransformation {

View File

@ -20,6 +20,7 @@ export default class CompactLines extends ToLineItemTransformation {
const {mostUsedDistance, fontToFormats} = parseResult.globals; const {mostUsedDistance, fontToFormats} = parseResult.globals;
const foundFootnotes = []; const foundFootnotes = [];
const foundFootnoteLinks = []; const foundFootnoteLinks = [];
var linkCount = 0;
var formattedWords = 0; var formattedWords = 0;
const lineGrouper = new TextItemLineGrouper({ const lineGrouper = new TextItemLineGrouper({
@ -47,6 +48,9 @@ export default class CompactLines extends ToLineItemTransformation {
} }
lineItems.push(lineItem); lineItems.push(lineItem);
if (lineItem.parsedElements.containLinks > 0) {
linkCount++;
}
if (lineItem.parsedElements.footnoteLinks.length > 0) { if (lineItem.parsedElements.footnoteLinks.length > 0) {
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>); const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
@ -66,6 +70,7 @@ export default class CompactLines extends ToLineItemTransformation {
...parseResult, ...parseResult,
messages: [ messages: [
'Detected ' + formattedWords + ' formatted words', 'Detected ' + formattedWords + ' formatted words',
'Found ' + linkCount + ' links',
<span>Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }]</span>, <span>Detected { foundFootnoteLinks.length } footnotes links: [{ foundFootnoteLinks }]</span>,
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>, <span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
] ]

View File

@ -1,171 +0,0 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx';
//Complete unopened/unclosed bold/italic formats
export default class CompleteFormats extends ToTextItemTransformation {
//TODO move to block and ignore quotes
constructor() {
super("Complete Bold/Italics");
}
transform(parseResult:ParseResult) {
// remove line formats from headers
parseResult.pages.forEach(page => {
page.items.forEach(item => {
if (item.type && item.type.headline) {
if (item.lineFormat || item.unopenedFormat || item.unclosedFormat) {
item.lineFormat = null;
item.unopenedFormat = null;
item.unclosedFormat = null;
item.annotation = UNCHANGED_ANNOTATION;
}
}
});
});
//close open formats
parseResult.pages.forEach(page => {
const itemStack = new ItemStack();
page.items.forEach(item => {
itemStack.consume(item);
});
page.items = itemStack.getResults();
});
return new ParseResult({
...parseResult,
messages: []
});
}
}
// Collects the text items of one page while tracking a single "open" format
// (a format that has been started but not yet ended). Items are fed in through
// consume(); getResults() returns the resulting item list.
class ItemStack {
constructor() {
// format currently open; left undefined until cache() is called
this.openFormat;
// item the open format belongs to.
// NOTE(review): initialised to an array here, while cache() stores a single
// text item and clear() resets it to null.
this.openFormatItem = [];
// output list, in the order items were written
this.resultItems = [];
}
// Remembers textItem as the item carrying the currently open format.
cache(textItem, format) {
this.openFormat = format;
this.openFormatItem = textItem;
}
// If a format is open: marks the cached item as removed, writes a copy of it
// with the format's end symbol appended, and forgets the open format.
closeOpenFormat() {
if (this.openFormat) {
this.openFormatItem.annotation = REMOVED_ANNOTATION;
this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat));
this.clear();
}
}
// Forgets the open format and its item.
clear() {
this.openFormat = null;
this.openFormatItem = null;
}
// Appends textItem to the output list.
writeToResults(textItem) {
this.resultItems.push(textItem);
}
// Closes a still-open format (which may append one more item) and returns
// the output list.
getResults() {
if (this.openFormat) {
this.closeOpenFormat();
}
return this.resultItems;
}
// Processes one item. The item itself is always written to the results; a
// replacement copy (newItem) is written right after it when one was created
// and not handed over to the cache by handleFreshUnclosed.
consume(item) {
var newItem;
// item ends a format that was never opened: replace it by a copy that starts
// with the format's start symbol
const handleFreshUnopened = () => {
item.annotation = REMOVED_ANNOTATION;
newItem = textItemWithOpening(item, item.unopenedFormat);
}
// whole item is formatted: replace it by a copy with the start symbol and
// remember that copy as the open format item
const handleFreshLine = () => {
item.annotation = REMOVED_ANNOTATION;
newItem = textItemWithOpening(item, item.lineFormat);
this.cache(newItem, item.lineFormat);
}
// item starts a format it does not end: remember it (or the copy created
// earlier in this call) as the open format item. When the copy is cached,
// newItem is reset so it is not written at the end of this call.
const handleFreshUnclosed = () => {
if (newItem) {
this.cache(newItem, item.unclosedFormat);
newItem = null;
} else {
this.cache(item, item.unclosedFormat);
}
}
//flush open format if possible
if (this.openFormat) {
if (item.unopenedFormat) {
if (item.unopenedFormat === this.openFormat) {
//good, closing an opened
this.clear();
} else {
this.closeOpenFormat();
handleFreshUnopened();
}
}
if (item.lineFormat) {
if (item.lineFormat === this.openFormat) {
// same format continues: this item becomes the open format item
this.cache(item, item.lineFormat);
} else {
this.closeOpenFormat();
handleFreshLine();
}
}
if (item.unclosedFormat) {
this.closeOpenFormat();
handleFreshUnclosed();
}
// unformatted item: whatever is open ends before it
if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) {
this.closeOpenFormat();
}
} else { // handle fresh items
if (item.unopenedFormat) {
handleFreshUnopened()
}
if (item.lineFormat) {
handleFreshLine();
}
if (item.unclosedFormat) {
handleFreshUnclosed();
}
}
this.writeToResults(item);
if (newItem) {
this.writeToResults(newItem);
}
}
}
// Creates a copy of textItem whose text is prefixed with the format's start
// symbol and whose annotation marks it as added.
function textItemWithOpening(textItem, format) {
    const prefixedText = format.startSymbol + textItem.text;
    return new TextItem({
        ...textItem,
        text: prefixedText,
        annotation: ADDED_ANNOTATION
    });
}
// Creates a copy of textItem whose text is suffixed with the format's end
// symbol and whose annotation marks it as added.
function textItemWithClosing(textItem, format) {
    const suffixedText = textItem.text + format.endSymbol;
    return new TextItem({
        ...textItem,
        text: suffixedText,
        annotation: ADDED_ANNOTATION
    });
}