separate type and format for a word

This commit is contained in:
Johannes Zillmann 2017-03-28 08:15:27 +02:00
parent 9dbc57b4fe
commit 106e2bfa8e
9 changed files with 56 additions and 123 deletions

View File

@ -5,7 +5,6 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
import CompactLines from './transformations/textitem/CompactLines.jsx'; import CompactLines from './transformations/textitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
@ -56,7 +55,6 @@ export default class AppState {
new CompactLines(), new CompactLines(),
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new VerticalToHorizontal(), new VerticalToHorizontal(),
// new PostprocessLines(),
new DetectTOC(), new DetectTOC(),
new DetectHeaders(), new DetectHeaders(),
new DetectListItems(), new DetectListItems(),

View File

@ -41,7 +41,8 @@ export default class LineConverter {
parsedElements: new ParsedElements({ parsedElements: new ParsedElements({
footnoteLinks: wordStream.footnoteLinks, footnoteLinks: wordStream.footnoteLinks,
footnotes: wordStream.footnotes, footnotes: wordStream.footnotes,
containLinks: wordStream.containLinks containLinks: wordStream.containLinks,
formattedWords: wordStream.formattedWords
}) })
}); });
@ -116,27 +117,25 @@ class WordDetectionStream extends StashingStream {
itemsToWords(items, format) { itemsToWords(items, format) {
const combinedText = combineText(items); const combinedText = combineText(items);
// const combinedText = items.map(textItem => textItem.text).join('');
const words = combinedText.split(' '); const words = combinedText.split(' ');
return words.filter(w => w.trim().length > 0).map(word => { return words.filter(w => w.trim().length > 0).map(word => {
var type = null;
if (word.startsWith('http:')) { if (word.startsWith('http:')) {
this.containLinks = true; this.containLinks = true;
return new Word({ type = WordType.LINK;
string: word,
type: WordType.LINK
});
} else if (word.startsWith('www.')) { } else if (word.startsWith('www.')) {
this.containLinks = true; this.containLinks = true;
word = `http://${word}` word = `http://${word}`
return new Word({ type = WordType.LINK;
string: word,
type: WordType.LINK
});
} }
if (format) {
this.formattedWords++;
}
return new Word({ return new Word({
string: word, string: word,
type: format type: type,
format: format
}); });
}); });
} }

View File

@ -18,12 +18,14 @@ export class ParsedElements {
this.footnoteLinks = options.footnoteLinks || []; this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes || []; this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks; this.containLinks = options.containLinks;
this.formattedWords = options.formattedWords;
} }
add(parsedElements) { add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks; this.containLinks = this.containLinks || parsedElements.containLinks;
this.formattedWords += parsedElements.formattedWords;
} }
} }

View File

@ -3,6 +3,7 @@ export default class Word {
constructor(options) { constructor(options) {
this.string = options.string; this.string = options.string;
this.type = options.type; // WordType this.type = options.type; // WordType
this.format = options.format; // WordFormat
} }
} }

View File

@ -0,0 +1,21 @@
import { Enum } from 'enumify';
/**
 * The format of a word element.
 *
 * Every enum instance carries a `startSymbol` and an `endSymbol`: the
 * Markdown-style markers placed in front of and behind text in that format.
 */
export default class WordFormat extends Enum {}

// Basic emphasis markers; the combined format nests the oblique marker inside the bold one.
const BOLD_MARK = '**';
const OBLIQUE_MARK = '_';

WordFormat.initEnum({
    BOLD: { startSymbol: BOLD_MARK, endSymbol: BOLD_MARK },
    OBLIQUE: { startSymbol: OBLIQUE_MARK, endSymbol: OBLIQUE_MARK },
    BOLD_OBLIQUE: {
        startSymbol: `${BOLD_MARK}${OBLIQUE_MARK}`,
        endSymbol: `${OBLIQUE_MARK}${BOLD_MARK}`,
    },
});

View File

@ -23,21 +23,6 @@ WordType.initEnum({
toText(string) { toText(string) {
return `(^${string})` return `(^${string})`
} }
},
BOLD: {
format: true,
startSymbol: '**',
endSymbol: '**',
},
OBLIQUE: {
format: true,
startSymbol: '_',
endSymbol: '_',
},
BOLD_OBLIQUE: {
format: true,
startSymbol: '**_',
endSymbol: '_**',
} }
}); });
@ -53,23 +38,22 @@ export function linesToText(lineItems, disableInlineFormats) {
lineItems.forEach((line, lineIndex) => { lineItems.forEach((line, lineIndex) => {
line.words.forEach((word, i) => { line.words.forEach((word, i) => {
const wordType = word.type; const wordType = word.type;
if (openFormat && (!wordType || wordType !== openFormat)) { const wordFormat = word.format;
if (openFormat && (!wordFormat || wordFormat !== openFormat)) {
closeFormat(); closeFormat();
} }
if (i > 0 && !(wordType && wordType.attachWithoutWhitespace) && !isPunctationCharacter(word.string)) { if (i > 0 && !(wordType && wordType.attachWithoutWhitespace) && !isPunctationCharacter(word.string)) {
text += ' '; text += ' ';
} }
if (wordFormat && !openFormat && (!disableInlineFormats)) {
openFormat = wordFormat;
text += openFormat.startSymbol;
}
if (wordType && (!disableInlineFormats || wordType.plainTextFormat)) { if (wordType && (!disableInlineFormats || wordType.plainTextFormat)) {
if (wordType.format) { text += wordType.toText(word.string);
if (!openFormat) {
openFormat = wordType;
text += openFormat.startSymbol;
}
text += word.string;
} else {
text += wordType.toText(word.string);
}
} else { } else {
text += word.string; text += word.string;
} }
@ -86,7 +70,7 @@ function firstFormat(lineItem) {
if (lineItem.words.length == 0) { if (lineItem.words.length == 0) {
return null; return null;
} }
return lineItem.words[0].type; return lineItem.words[0].format;
} }
function isPunctationCharacter(string) { function isPunctationCharacter(string) {

View File

@ -1,6 +1,6 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import WordType from '../../markdown/WordType.jsx'; import WordFormat from '../../markdown/WordFormat.jsx';
export default class CalculateGlobalStats extends ToTextItemTransformation { export default class CalculateGlobalStats extends ToTextItemTransformation {
@ -54,20 +54,20 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
this.fontMap.forEach(function(value, key) { this.fontMap.forEach(function(value, key) {
fontIdToName.push(key + " = " + value.name) fontIdToName.push(key + " = " + value.name)
const fontName = value.name.toLowerCase(); const fontName = value.name.toLowerCase();
var type; var format;
if (key == mostUsedFont) { if (key == mostUsedFont) {
type = null; format = null;
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) { } else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
type = WordType.BOLD_OBLIQUE; format = WordFormat.BOLD_OBLIQUE;
} else if (fontName.includes('bold')) { } else if (fontName.includes('bold')) {
type = WordType.BOLD; format = WordFormat.BOLD;
} else if (fontName.includes('oblique') || fontName.includes('italic')) { } else if (fontName.includes('oblique') || fontName.includes('italic')) {
type = WordType.OBLIQUE; format = WordFormat.OBLIQUE;
} else if (fontName === maxHeightFont) { } else if (fontName === maxHeightFont) {
type = WordType.BOLD; format = WordFormat.BOLD;
} }
if (type) { if (format) {
fontToFormats.set(key, type); fontToFormats.set(key, format);
} }
}); });
fontIdToName.sort(); fontIdToName.sort();

View File

@ -48,6 +48,9 @@ export default class CompactLines extends ToLineItemTransformation {
} }
lineItems.push(lineItem); lineItems.push(lineItem);
if (lineItem.parsedElements.formattedWords) {
formattedWords += lineItem.parsedElements.formattedWords;
}
if (lineItem.parsedElements.containLinks > 0) { if (lineItem.parsedElements.containLinks > 0) {
linkCount++; linkCount++;
} }

View File

@ -1,75 +0,0 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import { ParsedElements } from '../../PageItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
// Remove whitespace, detect links, etc...
export default class PostprocessLines extends ToTextItemTransformation {
constructor() {
super("Remove Whitespace & Detect Links");
this.showWhitespaces = true;
}
transform(parseResult:ParseResult) {
var strippedWhitespace = 0;
var foundLinks = 0;
parseResult.pages.forEach(page => {
const newItems = [];
page.items.forEach(lineItem => {
newItems.push(lineItem);
var words = lineItem.text.split(' ');
var newWords = [];
var foundSuperflousNewLine = false;
var foundLink = false;
words.forEach(word => {
if (word.trim().length == 0) {
foundSuperflousNewLine = true;
strippedWhitespace++;
} else {
if (word.startsWith('http:')) {
foundLinks++;
foundLink = true;
newWords.push(`[${word}](${word})`);
} else if (word.startsWith('www.')) {
foundLinks++;
foundLink = true;
newWords.push(`[http://${word}](http://${word})`);
} else {
newWords.push(word);
}
}
});
if (foundSuperflousNewLine || foundLink) {
lineItem.annotation = REMOVED_ANNOTATION;
if (newWords.length > 0) {
newItems.push(new TextItem({
...lineItem,
text: newWords.join(' '),
annotation: ADDED_ANNOTATION,
parsedElements: new ParsedElements({
...lineItem.parsedElements,
containLinks: foundLink
})
}));
}
}
});
page.items = newItems;
});
return new ParseResult({
...parseResult,
messages: [
'Stripped ' + strippedWhitespace + ' superflous whitespaces',
'Found ' + foundLinks + ' links',
]
});
}
}