mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-24 00:33:48 +01:00
separate type and format for a word
This commit is contained in:
parent
9dbc57b4fe
commit
106e2bfa8e
@ -5,7 +5,6 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
|
||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||
import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
|
||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||
@ -56,7 +55,6 @@ export default class AppState {
|
||||
new CompactLines(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new VerticalToHorizontal(),
|
||||
// new PostprocessLines(),
|
||||
new DetectTOC(),
|
||||
new DetectHeaders(),
|
||||
new DetectListItems(),
|
||||
|
@ -41,7 +41,8 @@ export default class LineConverter {
|
||||
parsedElements: new ParsedElements({
|
||||
footnoteLinks: wordStream.footnoteLinks,
|
||||
footnotes: wordStream.footnotes,
|
||||
containLinks: wordStream.containLinks
|
||||
containLinks: wordStream.containLinks,
|
||||
formattedWords: wordStream.formattedWords
|
||||
})
|
||||
});
|
||||
|
||||
@ -116,27 +117,25 @@ class WordDetectionStream extends StashingStream {
|
||||
|
||||
itemsToWords(items, format) {
|
||||
const combinedText = combineText(items);
|
||||
// const combinedText = items.map(textItem => textItem.text).join('');
|
||||
const words = combinedText.split(' ');
|
||||
return words.filter(w => w.trim().length > 0).map(word => {
|
||||
var type = null;
|
||||
if (word.startsWith('http:')) {
|
||||
this.containLinks = true;
|
||||
return new Word({
|
||||
string: word,
|
||||
type: WordType.LINK
|
||||
});
|
||||
type = WordType.LINK;
|
||||
} else if (word.startsWith('www.')) {
|
||||
this.containLinks = true;
|
||||
word = `http://${word}`
|
||||
return new Word({
|
||||
string: word,
|
||||
type: WordType.LINK
|
||||
});
|
||||
type = WordType.LINK;
|
||||
}
|
||||
|
||||
if (format) {
|
||||
this.formattedWords++;
|
||||
}
|
||||
return new Word({
|
||||
string: word,
|
||||
type: format
|
||||
type: type,
|
||||
format: format
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -18,12 +18,14 @@ export class ParsedElements {
|
||||
this.footnoteLinks = options.footnoteLinks || [];
|
||||
this.footnotes = options.footnotes || [];
|
||||
this.containLinks = options.containLinks;
|
||||
this.formattedWords = options.formattedWords;
|
||||
}
|
||||
|
||||
add(parsedElements) {
|
||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||
this.containLinks = this.containLinks || parsedElements.containLinks;
|
||||
this.formattedWords += parsedElements.formattedWords;
|
||||
}
|
||||
|
||||
}
|
@ -3,6 +3,7 @@ export default class Word {
|
||||
constructor(options) {
|
||||
this.string = options.string;
|
||||
this.type = options.type; // WordType
|
||||
this.format = options.format; // WordFormat
|
||||
}
|
||||
|
||||
}
|
21
src/javascript/models/markdown/WordFormat.jsx
Normal file
21
src/javascript/models/markdown/WordFormat.jsx
Normal file
@ -0,0 +1,21 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
/**
 * The format of a word element.
 *
 * The class body is intentionally empty: the enum values are registered by
 * the `initEnum` call below, and each value carries the marker strings
 * (`startSymbol` / `endSymbol`) associated with that format.
 */
export default class WordFormat extends Enum {}

// Registration order is kept as BOLD, OBLIQUE, BOLD_OBLIQUE.
WordFormat.initEnum({
    BOLD: { startSymbol: '**', endSymbol: '**' },
    OBLIQUE: { startSymbol: '_', endSymbol: '_' },
    BOLD_OBLIQUE: { startSymbol: '**_', endSymbol: '_**' }
});
|
@ -23,21 +23,6 @@ WordType.initEnum({
|
||||
toText(string) {
|
||||
return `(^${string})`
|
||||
}
|
||||
},
|
||||
BOLD: {
|
||||
format: true,
|
||||
startSymbol: '**',
|
||||
endSymbol: '**',
|
||||
},
|
||||
OBLIQUE: {
|
||||
format: true,
|
||||
startSymbol: '_',
|
||||
endSymbol: '_',
|
||||
},
|
||||
BOLD_OBLIQUE: {
|
||||
format: true,
|
||||
startSymbol: '**_',
|
||||
endSymbol: '_**',
|
||||
}
|
||||
});
|
||||
|
||||
@ -53,23 +38,22 @@ export function linesToText(lineItems, disableInlineFormats) {
|
||||
lineItems.forEach((line, lineIndex) => {
|
||||
line.words.forEach((word, i) => {
|
||||
const wordType = word.type;
|
||||
if (openFormat && (!wordType || wordType !== openFormat)) {
|
||||
const wordFormat = word.format;
|
||||
if (openFormat && (!wordFormat || wordFormat !== openFormat)) {
|
||||
closeFormat();
|
||||
}
|
||||
|
||||
if (i > 0 && !(wordType && wordType.attachWithoutWhitespace) && !isPunctationCharacter(word.string)) {
|
||||
text += ' ';
|
||||
}
|
||||
|
||||
if (wordFormat && !openFormat && (!disableInlineFormats)) {
|
||||
openFormat = wordFormat;
|
||||
text += openFormat.startSymbol;
|
||||
}
|
||||
|
||||
if (wordType && (!disableInlineFormats || wordType.plainTextFormat)) {
|
||||
if (wordType.format) {
|
||||
if (!openFormat) {
|
||||
openFormat = wordType;
|
||||
text += openFormat.startSymbol;
|
||||
}
|
||||
text += word.string;
|
||||
} else {
|
||||
text += wordType.toText(word.string);
|
||||
}
|
||||
text += wordType.toText(word.string);
|
||||
} else {
|
||||
text += word.string;
|
||||
}
|
||||
@ -86,7 +70,7 @@ function firstFormat(lineItem) {
|
||||
if (lineItem.words.length == 0) {
|
||||
return null;
|
||||
}
|
||||
return lineItem.words[0].type;
|
||||
return lineItem.words[0].format;
|
||||
}
|
||||
|
||||
function isPunctationCharacter(string) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import WordType from '../../markdown/WordType.jsx';
|
||||
import WordFormat from '../../markdown/WordFormat.jsx';
|
||||
|
||||
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||
|
||||
@ -54,20 +54,20 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||
this.fontMap.forEach(function(value, key) {
|
||||
fontIdToName.push(key + " = " + value.name)
|
||||
const fontName = value.name.toLowerCase();
|
||||
var type;
|
||||
var format;
|
||||
if (key == mostUsedFont) {
|
||||
type = null;
|
||||
format = null;
|
||||
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
|
||||
type = WordType.BOLD_OBLIQUE;
|
||||
format = WordFormat.BOLD_OBLIQUE;
|
||||
} else if (fontName.includes('bold')) {
|
||||
type = WordType.BOLD;
|
||||
format = WordFormat.BOLD;
|
||||
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
|
||||
type = WordType.OBLIQUE;
|
||||
format = WordFormat.OBLIQUE;
|
||||
} else if (fontName === maxHeightFont) {
|
||||
type = WordType.BOLD;
|
||||
format = WordFormat.BOLD;
|
||||
}
|
||||
if (type) {
|
||||
fontToFormats.set(key, type);
|
||||
if (format) {
|
||||
fontToFormats.set(key, format);
|
||||
}
|
||||
});
|
||||
fontIdToName.sort();
|
||||
|
@ -48,6 +48,9 @@ export default class CompactLines extends ToLineItemTransformation {
|
||||
}
|
||||
lineItems.push(lineItem);
|
||||
|
||||
if (lineItem.parsedElements.formattedWords) {
|
||||
formattedWords += lineItem.parsedElements.formattedWords;
|
||||
}
|
||||
if (lineItem.parsedElements.containLinks > 0) {
|
||||
linkCount++;
|
||||
}
|
||||
|
@ -1,75 +0,0 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import { ParsedElements } from '../../PageItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
|
||||
|
||||
// Remove whitespace, detect links, etc...
|
||||
export default class PostprocessLines extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Whitespace & Detect Links");
|
||||
this.showWhitespaces = true;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var strippedWhitespace = 0;
|
||||
var foundLinks = 0;
|
||||
|
||||
parseResult.pages.forEach(page => {
|
||||
const newItems = [];
|
||||
page.items.forEach(lineItem => {
|
||||
newItems.push(lineItem);
|
||||
var words = lineItem.text.split(' ');
|
||||
var newWords = [];
|
||||
var foundSuperflousNewLine = false;
|
||||
var foundLink = false;
|
||||
words.forEach(word => {
|
||||
if (word.trim().length == 0) {
|
||||
foundSuperflousNewLine = true;
|
||||
strippedWhitespace++;
|
||||
} else {
|
||||
if (word.startsWith('http:')) {
|
||||
foundLinks++;
|
||||
foundLink = true;
|
||||
newWords.push(`[${word}](${word})`);
|
||||
} else if (word.startsWith('www.')) {
|
||||
foundLinks++;
|
||||
foundLink = true;
|
||||
newWords.push(`[http://${word}](http://${word})`);
|
||||
} else {
|
||||
newWords.push(word);
|
||||
}
|
||||
}
|
||||
});
|
||||
if (foundSuperflousNewLine || foundLink) {
|
||||
lineItem.annotation = REMOVED_ANNOTATION;
|
||||
if (newWords.length > 0) {
|
||||
newItems.push(new TextItem({
|
||||
...lineItem,
|
||||
text: newWords.join(' '),
|
||||
annotation: ADDED_ANNOTATION,
|
||||
parsedElements: new ParsedElements({
|
||||
...lineItem.parsedElements,
|
||||
containLinks: foundLink
|
||||
})
|
||||
}));
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newItems;
|
||||
});
|
||||
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Stripped ' + strippedWhitespace + ' superflous whitespaces',
|
||||
'Found ' + foundLinks + ' links',
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user