mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-18 10:50:47 +01:00
Render footnotes
This commit is contained in:
parent
5daa8aa45a
commit
5db815076b
@ -1,25 +1,28 @@
|
||||
import { groupByBlock, groupByLine } from '../support/groupingUtils';
|
||||
import Item from '../Item';
|
||||
import { Converter } from '../convert';
|
||||
import ChangeTracker from '../debug/ChangeTracker';
|
||||
import EvaluationTracker from '../debug/EvaluationTracker';
|
||||
import LineItemMerger from '../debug/LineItemMerger';
|
||||
import { TextType, headlineLevel } from '../text-types';
|
||||
import {
|
||||
TokenType,
|
||||
attachWithoutWhitespace,
|
||||
endSymbol,
|
||||
plainTextFormat,
|
||||
startSymbol,
|
||||
tokenToText,
|
||||
} from '../token-types';
|
||||
|
||||
export default class MarkdownConverter implements Converter {
|
||||
convert(items: Item[]) {
|
||||
let content = '';
|
||||
|
||||
const lineMerger = new LineItemMerger();
|
||||
const evaluationTracker = new EvaluationTracker();
|
||||
const changeTracker = new ChangeTracker();
|
||||
groupByBlock(items).forEach((blockItems) => {
|
||||
const types = blockItems[0].data['types'] || [];
|
||||
let blockContent = '';
|
||||
groupByLine(blockItems).forEach((lineItems) => {
|
||||
const lineItem = lineMerger.merge(evaluationTracker, changeTracker, ['types'], lineItems);
|
||||
const itemText = lineItem.data['str'];
|
||||
content += elementToText(itemText, types[0]);
|
||||
blockContent += linesToText(lineItems);
|
||||
blockContent += '\n';
|
||||
});
|
||||
content += elementToText(blockContent, types[0]);
|
||||
content += '\n';
|
||||
});
|
||||
|
||||
@ -40,3 +43,64 @@ function elementToText(text: string, type: TextType) {
|
||||
return text + '\n';
|
||||
}
|
||||
}
|
||||
|
||||
function toWords(text: string): string[] {
|
||||
return text.split(' ').filter((string) => string.trim().length > 0);
|
||||
}
|
||||
|
||||
export function linesToText(lineItems: Item[], disableInlineFormats: boolean = false) {
|
||||
let text = '';
|
||||
let openFormat: TokenType;
|
||||
|
||||
const closeFormat = () => {
|
||||
text += endSymbol(openFormat);
|
||||
openFormat = null;
|
||||
};
|
||||
|
||||
lineItems.forEach((lineItem, lineIndex) => {
|
||||
const words = toWords(lineItem.data['str']);
|
||||
words.forEach((word, i) => {
|
||||
const wordType = lineItem.tokenTypes[0]; // footnote, link, etc...
|
||||
const wordFormat = lineItem.tokenTypes[0]; // bold, oblique, etc...
|
||||
if (openFormat && (!wordFormat || wordFormat !== openFormat)) {
|
||||
closeFormat();
|
||||
}
|
||||
if (
|
||||
(i > 0 || lineIndex > 0) &&
|
||||
!(wordType && attachWithoutWhitespace(wordType)) &&
|
||||
!isPunctationCharacter(word)
|
||||
) {
|
||||
text += ' ';
|
||||
}
|
||||
|
||||
if (wordFormat && !openFormat && !disableInlineFormats) {
|
||||
openFormat = wordFormat;
|
||||
text += startSymbol(openFormat);
|
||||
}
|
||||
|
||||
if (wordType && (!disableInlineFormats || plainTextFormat(wordType))) {
|
||||
text += tokenToText(word, wordFormat);
|
||||
} else {
|
||||
text += word;
|
||||
}
|
||||
});
|
||||
if (openFormat && (lineIndex == lineItems.length - 1 || firstFormat(lineItems[lineIndex + 1]) !== openFormat)) {
|
||||
closeFormat();
|
||||
}
|
||||
});
|
||||
return text;
|
||||
}
|
||||
|
||||
function firstFormat(lineItem: Item) {
|
||||
if (lineItem.data['str'].length == 0) {
|
||||
return null;
|
||||
}
|
||||
return lineItem.tokenTypes[0];
|
||||
}
|
||||
|
||||
function isPunctationCharacter(value: string) {
|
||||
if (value.length != 1) {
|
||||
return false;
|
||||
}
|
||||
return value[0] === '.' || value[0] === '!' || value[0] === '?';
|
||||
}
|
||||
|
@ -1,2 +1,57 @@
|
||||
export type FontType = 'BOLD' | 'OBLIQUE';
|
||||
export type TokenType = 'LINK' | 'FOOTNOTE' | 'FOOTNOTE_LINK' | FontType;
|
||||
|
||||
export const startSymbol = (type: TokenType) => {
|
||||
switch (type) {
|
||||
case 'BOLD':
|
||||
return '**';
|
||||
case 'OBLIQUE':
|
||||
return '_';
|
||||
default:
|
||||
return '';
|
||||
// throw new Error(`No start symbol defined for type: ${type}`);
|
||||
}
|
||||
};
|
||||
|
||||
export const endSymbol = (type: TokenType) => {
|
||||
switch (type) {
|
||||
case 'BOLD':
|
||||
return '**';
|
||||
case 'OBLIQUE':
|
||||
return '_';
|
||||
default:
|
||||
return '';
|
||||
// throw new Error(`No end symbol defined for type: ${type}`);
|
||||
}
|
||||
};
|
||||
|
||||
export const attachWithoutWhitespace = (type: TokenType) => {
|
||||
switch (type) {
|
||||
case 'FOOTNOTE_LINK':
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
export const plainTextFormat = (type: TokenType) => {
|
||||
switch (type) {
|
||||
case 'FOOTNOTE_LINK':
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
export function tokenToText(token: string, type: TokenType) {
|
||||
switch (type) {
|
||||
case 'LINK':
|
||||
return `[${token}](${token})`;
|
||||
case 'FOOTNOTE_LINK':
|
||||
return `[^${token}]`;
|
||||
case 'FOOTNOTE':
|
||||
return `[^${token}]:`;
|
||||
default:
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user