mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-08 13:59:43 +01:00
169 lines
5.4 KiB
JavaScript
169 lines
5.4 KiB
JavaScript
import TextItem from './TextItem.jsx';
|
|
import Word from './Word.jsx';
|
|
import WordType from './markdown/WordType.jsx';
|
|
import WordFormat from './markdown/WordFormat.jsx';
|
|
import LineItem from './LineItem.jsx';
|
|
import StashingStream from './StashingStream.jsx';
|
|
import { ParsedElements } from './PageItem.jsx';
|
|
import { isNumber, isListItemCharacter } from '../stringFunctions.jsx'
|
|
import { sortByX } from '../pageItemFunctions.jsx'
|
|
|
|
// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like
|
|
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
|
export default class LineConverter {
|
|
|
|
constructor(fontToFormats) {
|
|
this.fontToFormats = fontToFormats;
|
|
}
|
|
|
|
// returns a CombineResult
|
|
compact(textItems: TextItem[]) {
|
|
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
|
sortByX(textItems);
|
|
|
|
const wordStream = new WordDetectionStream(this.fontToFormats);
|
|
wordStream.consumeAll(textItems.map(item => new TextItem({
|
|
...item
|
|
})));
|
|
const words = wordStream.complete();
|
|
|
|
var maxHeight = 0;
|
|
var widthSum = 0;
|
|
textItems.forEach(item => {
|
|
maxHeight = Math.max(maxHeight, item.height);
|
|
widthSum += item.width;
|
|
});
|
|
return new LineItem({
|
|
x: textItems[0].x,
|
|
y: textItems[0].y,
|
|
height: maxHeight,
|
|
width: widthSum,
|
|
words: words,
|
|
parsedElements: new ParsedElements({
|
|
footnoteLinks: wordStream.footnoteLinks,
|
|
footnotes: wordStream.footnotes,
|
|
containLinks: wordStream.containLinks,
|
|
formattedWords: wordStream.formattedWords
|
|
})
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
class WordDetectionStream extends StashingStream {
|
|
|
|
constructor(fontToFormats) {
|
|
super();
|
|
this.fontToFormats = fontToFormats;
|
|
this.footnoteLinks = [];
|
|
this.footnotes = [];
|
|
this.formattedWords = 0
|
|
this.containLinks = false;
|
|
|
|
this.firstY;
|
|
this.stashedNumber = false;
|
|
this.currentItem;
|
|
}
|
|
|
|
shouldStash(item) { // eslint-disable-line no-unused-vars
|
|
if (!this.firstY) {
|
|
this.firstY = item.y;
|
|
}
|
|
this.currentItem = item;
|
|
return true;
|
|
}
|
|
|
|
onPushOnStash(item) { // eslint-disable-line no-unused-vars
|
|
this.stashedNumber = isNumber(item.text.trim());
|
|
}
|
|
|
|
doMatchesStash(lastItem, item) {
|
|
const lastItemFormat = this.fontToFormats.get(lastItem.font);
|
|
const itemFormat = this.fontToFormats.get(item.font);
|
|
if (lastItemFormat !== itemFormat) {
|
|
return false;
|
|
}
|
|
const itemIsANumber = isNumber(item.text.trim());
|
|
return this.stashedNumber == itemIsANumber;
|
|
}
|
|
|
|
doFlushStash(stash, results) {
|
|
if (this.stashedNumber) {
|
|
const joinedNumber = stash.map(item => item.text).join('').trim();
|
|
if (stash[0].y > this.firstY) { // footnote link
|
|
results.push(new Word({
|
|
string: `${joinedNumber}`,
|
|
type: WordType.FOOTNOTE_LINK
|
|
}));
|
|
this.footnoteLinks.push(parseInt(joinedNumber));
|
|
} else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote
|
|
results.push(new Word({
|
|
string: `${joinedNumber}`,
|
|
type: WordType.FOOTNOTE
|
|
}));
|
|
this.footnotes.push(joinedNumber);
|
|
} else {
|
|
this.copyStashItemsAsText(stash, results);
|
|
}
|
|
} else {
|
|
this.copyStashItemsAsText(stash, results);
|
|
}
|
|
}
|
|
|
|
copyStashItemsAsText(stash, results) {
|
|
const format = this.fontToFormats.get(stash[0].font);
|
|
results.push(...this.itemsToWords(stash, format));
|
|
}
|
|
|
|
itemsToWords(items, formatName) {
|
|
const combinedText = combineText(items);
|
|
const words = combinedText.split(' ');
|
|
const format = formatName ? WordFormat.enumValueOf(formatName) : null;
|
|
return words.filter(w => w.trim().length > 0).map(word => {
|
|
var type = null;
|
|
if (word.startsWith('http:')) {
|
|
this.containLinks = true;
|
|
type = WordType.LINK;
|
|
} else if (word.startsWith('www.')) {
|
|
this.containLinks = true;
|
|
word = `http://${word}`
|
|
type = WordType.LINK;
|
|
}
|
|
|
|
if (format) {
|
|
this.formattedWords++;
|
|
}
|
|
return new Word({
|
|
string: word,
|
|
type: type,
|
|
format: format
|
|
});
|
|
});
|
|
}
|
|
}
|
|
|
|
|
|
function combineText(textItems) {
|
|
var text = '';
|
|
var lastItem;
|
|
textItems.forEach(textItem => {
|
|
var textToAdd = textItem.text;
|
|
if (!text.endsWith(' ') && !textToAdd.startsWith(' ')) {
|
|
if (lastItem) {
|
|
const xDistance = textItem.x - lastItem.x - lastItem.width;
|
|
if (xDistance > 5) {
|
|
text += ' ';
|
|
}
|
|
} else {
|
|
if (isListItemCharacter(textItem.text)) {
|
|
textToAdd += ' ';
|
|
}
|
|
}
|
|
}
|
|
text += textToAdd;
|
|
lastItem = textItem;
|
|
});
|
|
return text;
|
|
}
|