mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-27 18:23:36 +01:00
separate type and format for a word
This commit is contained in:
parent
9dbc57b4fe
commit
106e2bfa8e
@ -5,7 +5,6 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
|
|||||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
||||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||||
import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
|
|
||||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||||
@ -56,7 +55,6 @@ export default class AppState {
|
|||||||
new CompactLines(),
|
new CompactLines(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new VerticalToHorizontal(),
|
new VerticalToHorizontal(),
|
||||||
// new PostprocessLines(),
|
|
||||||
new DetectTOC(),
|
new DetectTOC(),
|
||||||
new DetectHeaders(),
|
new DetectHeaders(),
|
||||||
new DetectListItems(),
|
new DetectListItems(),
|
||||||
|
@ -41,7 +41,8 @@ export default class LineConverter {
|
|||||||
parsedElements: new ParsedElements({
|
parsedElements: new ParsedElements({
|
||||||
footnoteLinks: wordStream.footnoteLinks,
|
footnoteLinks: wordStream.footnoteLinks,
|
||||||
footnotes: wordStream.footnotes,
|
footnotes: wordStream.footnotes,
|
||||||
containLinks: wordStream.containLinks
|
containLinks: wordStream.containLinks,
|
||||||
|
formattedWords: wordStream.formattedWords
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -116,27 +117,25 @@ class WordDetectionStream extends StashingStream {
|
|||||||
|
|
||||||
itemsToWords(items, format) {
|
itemsToWords(items, format) {
|
||||||
const combinedText = combineText(items);
|
const combinedText = combineText(items);
|
||||||
// const combinedText = items.map(textItem => textItem.text).join('');
|
|
||||||
const words = combinedText.split(' ');
|
const words = combinedText.split(' ');
|
||||||
return words.filter(w => w.trim().length > 0).map(word => {
|
return words.filter(w => w.trim().length > 0).map(word => {
|
||||||
|
var type = null;
|
||||||
if (word.startsWith('http:')) {
|
if (word.startsWith('http:')) {
|
||||||
this.containLinks = true;
|
this.containLinks = true;
|
||||||
return new Word({
|
type = WordType.LINK;
|
||||||
string: word,
|
|
||||||
type: WordType.LINK
|
|
||||||
});
|
|
||||||
} else if (word.startsWith('www.')) {
|
} else if (word.startsWith('www.')) {
|
||||||
this.containLinks = true;
|
this.containLinks = true;
|
||||||
word = `http://${word}`
|
word = `http://${word}`
|
||||||
return new Word({
|
type = WordType.LINK;
|
||||||
string: word,
|
|
||||||
type: WordType.LINK
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (format) {
|
||||||
|
this.formattedWords++;
|
||||||
|
}
|
||||||
return new Word({
|
return new Word({
|
||||||
string: word,
|
string: word,
|
||||||
type: format
|
type: type,
|
||||||
|
format: format
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -18,12 +18,14 @@ export class ParsedElements {
|
|||||||
this.footnoteLinks = options.footnoteLinks || [];
|
this.footnoteLinks = options.footnoteLinks || [];
|
||||||
this.footnotes = options.footnotes || [];
|
this.footnotes = options.footnotes || [];
|
||||||
this.containLinks = options.containLinks;
|
this.containLinks = options.containLinks;
|
||||||
|
this.formattedWords = options.formattedWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
add(parsedElements) {
|
add(parsedElements) {
|
||||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||||
this.containLinks = this.containLinks || parsedElements.containLinks;
|
this.containLinks = this.containLinks || parsedElements.containLinks;
|
||||||
|
this.formattedWords += parsedElements.formattedWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -3,6 +3,7 @@ export default class Word {
|
|||||||
constructor(options) {
|
constructor(options) {
|
||||||
this.string = options.string;
|
this.string = options.string;
|
||||||
this.type = options.type; // WordType
|
this.type = options.type; // WordType
|
||||||
|
this.format = options.format; // WordFormat
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
21
src/javascript/models/markdown/WordFormat.jsx
Normal file
21
src/javascript/models/markdown/WordFormat.jsx
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import { Enum } from 'enumify';
|
||||||
|
|
||||||
|
// The format of a word element
|
||||||
|
export default class WordFormat extends Enum {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
WordFormat.initEnum({
|
||||||
|
BOLD: {
|
||||||
|
startSymbol: '**',
|
||||||
|
endSymbol: '**',
|
||||||
|
},
|
||||||
|
OBLIQUE: {
|
||||||
|
startSymbol: '_',
|
||||||
|
endSymbol: '_',
|
||||||
|
},
|
||||||
|
BOLD_OBLIQUE: {
|
||||||
|
startSymbol: '**_',
|
||||||
|
endSymbol: '_**',
|
||||||
|
}
|
||||||
|
});
|
@ -23,21 +23,6 @@ WordType.initEnum({
|
|||||||
toText(string) {
|
toText(string) {
|
||||||
return `(^${string})`
|
return `(^${string})`
|
||||||
}
|
}
|
||||||
},
|
|
||||||
BOLD: {
|
|
||||||
format: true,
|
|
||||||
startSymbol: '**',
|
|
||||||
endSymbol: '**',
|
|
||||||
},
|
|
||||||
OBLIQUE: {
|
|
||||||
format: true,
|
|
||||||
startSymbol: '_',
|
|
||||||
endSymbol: '_',
|
|
||||||
},
|
|
||||||
BOLD_OBLIQUE: {
|
|
||||||
format: true,
|
|
||||||
startSymbol: '**_',
|
|
||||||
endSymbol: '_**',
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -53,23 +38,22 @@ export function linesToText(lineItems, disableInlineFormats) {
|
|||||||
lineItems.forEach((line, lineIndex) => {
|
lineItems.forEach((line, lineIndex) => {
|
||||||
line.words.forEach((word, i) => {
|
line.words.forEach((word, i) => {
|
||||||
const wordType = word.type;
|
const wordType = word.type;
|
||||||
if (openFormat && (!wordType || wordType !== openFormat)) {
|
const wordFormat = word.format;
|
||||||
|
if (openFormat && (!wordFormat || wordFormat !== openFormat)) {
|
||||||
closeFormat();
|
closeFormat();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i > 0 && !(wordType && wordType.attachWithoutWhitespace) && !isPunctationCharacter(word.string)) {
|
if (i > 0 && !(wordType && wordType.attachWithoutWhitespace) && !isPunctationCharacter(word.string)) {
|
||||||
text += ' ';
|
text += ' ';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (wordFormat && !openFormat && (!disableInlineFormats)) {
|
||||||
|
openFormat = wordFormat;
|
||||||
|
text += openFormat.startSymbol;
|
||||||
|
}
|
||||||
|
|
||||||
if (wordType && (!disableInlineFormats || wordType.plainTextFormat)) {
|
if (wordType && (!disableInlineFormats || wordType.plainTextFormat)) {
|
||||||
if (wordType.format) {
|
text += wordType.toText(word.string);
|
||||||
if (!openFormat) {
|
|
||||||
openFormat = wordType;
|
|
||||||
text += openFormat.startSymbol;
|
|
||||||
}
|
|
||||||
text += word.string;
|
|
||||||
} else {
|
|
||||||
text += wordType.toText(word.string);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
text += word.string;
|
text += word.string;
|
||||||
}
|
}
|
||||||
@ -86,7 +70,7 @@ function firstFormat(lineItem) {
|
|||||||
if (lineItem.words.length == 0) {
|
if (lineItem.words.length == 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return lineItem.words[0].type;
|
return lineItem.words[0].format;
|
||||||
}
|
}
|
||||||
|
|
||||||
function isPunctationCharacter(string) {
|
function isPunctationCharacter(string) {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import WordType from '../../markdown/WordType.jsx';
|
import WordFormat from '../../markdown/WordFormat.jsx';
|
||||||
|
|
||||||
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||||
|
|
||||||
@ -54,20 +54,20 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
|||||||
this.fontMap.forEach(function(value, key) {
|
this.fontMap.forEach(function(value, key) {
|
||||||
fontIdToName.push(key + " = " + value.name)
|
fontIdToName.push(key + " = " + value.name)
|
||||||
const fontName = value.name.toLowerCase();
|
const fontName = value.name.toLowerCase();
|
||||||
var type;
|
var format;
|
||||||
if (key == mostUsedFont) {
|
if (key == mostUsedFont) {
|
||||||
type = null;
|
format = null;
|
||||||
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
|
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
|
||||||
type = WordType.BOLD_OBLIQUE;
|
format = WordFormat.BOLD_OBLIQUE;
|
||||||
} else if (fontName.includes('bold')) {
|
} else if (fontName.includes('bold')) {
|
||||||
type = WordType.BOLD;
|
format = WordFormat.BOLD;
|
||||||
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
|
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
|
||||||
type = WordType.OBLIQUE;
|
format = WordFormat.OBLIQUE;
|
||||||
} else if (fontName === maxHeightFont) {
|
} else if (fontName === maxHeightFont) {
|
||||||
type = WordType.BOLD;
|
format = WordFormat.BOLD;
|
||||||
}
|
}
|
||||||
if (type) {
|
if (format) {
|
||||||
fontToFormats.set(key, type);
|
fontToFormats.set(key, format);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
fontIdToName.sort();
|
fontIdToName.sort();
|
||||||
|
@ -48,6 +48,9 @@ export default class CompactLines extends ToLineItemTransformation {
|
|||||||
}
|
}
|
||||||
lineItems.push(lineItem);
|
lineItems.push(lineItem);
|
||||||
|
|
||||||
|
if (lineItem.parsedElements.formattedWords) {
|
||||||
|
formattedWords += lineItem.parsedElements.formattedWords;
|
||||||
|
}
|
||||||
if (lineItem.parsedElements.containLinks > 0) {
|
if (lineItem.parsedElements.containLinks > 0) {
|
||||||
linkCount++;
|
linkCount++;
|
||||||
}
|
}
|
||||||
|
@ -1,75 +0,0 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
|
||||||
import TextItem from '../../TextItem.jsx';
|
|
||||||
import { ParsedElements } from '../../PageItem.jsx';
|
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
|
||||||
|
|
||||||
|
|
||||||
// Remove whitespace, detect links, etc...
|
|
||||||
export default class PostprocessLines extends ToTextItemTransformation {
|
|
||||||
|
|
||||||
constructor() {
|
|
||||||
super("Remove Whitespace & Detect Links");
|
|
||||||
this.showWhitespaces = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
|
||||||
var strippedWhitespace = 0;
|
|
||||||
var foundLinks = 0;
|
|
||||||
|
|
||||||
parseResult.pages.forEach(page => {
|
|
||||||
const newItems = [];
|
|
||||||
page.items.forEach(lineItem => {
|
|
||||||
newItems.push(lineItem);
|
|
||||||
var words = lineItem.text.split(' ');
|
|
||||||
var newWords = [];
|
|
||||||
var foundSuperflousNewLine = false;
|
|
||||||
var foundLink = false;
|
|
||||||
words.forEach(word => {
|
|
||||||
if (word.trim().length == 0) {
|
|
||||||
foundSuperflousNewLine = true;
|
|
||||||
strippedWhitespace++;
|
|
||||||
} else {
|
|
||||||
if (word.startsWith('http:')) {
|
|
||||||
foundLinks++;
|
|
||||||
foundLink = true;
|
|
||||||
newWords.push(`[${word}](${word})`);
|
|
||||||
} else if (word.startsWith('www.')) {
|
|
||||||
foundLinks++;
|
|
||||||
foundLink = true;
|
|
||||||
newWords.push(`[http://${word}](http://${word})`);
|
|
||||||
} else {
|
|
||||||
newWords.push(word);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
if (foundSuperflousNewLine || foundLink) {
|
|
||||||
lineItem.annotation = REMOVED_ANNOTATION;
|
|
||||||
if (newWords.length > 0) {
|
|
||||||
newItems.push(new TextItem({
|
|
||||||
...lineItem,
|
|
||||||
text: newWords.join(' '),
|
|
||||||
annotation: ADDED_ANNOTATION,
|
|
||||||
parsedElements: new ParsedElements({
|
|
||||||
...lineItem.parsedElements,
|
|
||||||
containLinks: foundLink
|
|
||||||
})
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
page.items = newItems;
|
|
||||||
});
|
|
||||||
|
|
||||||
|
|
||||||
return new ParseResult({
|
|
||||||
...parseResult,
|
|
||||||
messages: [
|
|
||||||
'Stripped ' + strippedWhitespace + ' superflous whitespaces',
|
|
||||||
'Found ' + foundLinks + ' links',
|
|
||||||
]
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user