mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
[WIP] stabilized formatting
This commit is contained in:
parent
10cc7cf0ab
commit
e144d6a6d5
@ -55,6 +55,32 @@ export function removeLeadingWhitespaces(string) {
|
|||||||
return string;
|
return string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Removes every trailing whitespace character from the given string.
 *
 * Only characters whose char code equals WHITESPACE_CHAR_CODE count as
 * whitespace here; whitespace that is followed by other characters is kept.
 *
 * @param {string} string - the text to strip on its right-hand side
 * @returns {string} the text without trailing whitespace characters
 */
export function removeTrailingWhitespaces(string) {
    // Find where the content ends first, so that only a single substring is
    // created instead of one new string per removed character.
    let end = string.length;
    while (end > 0 && string.charCodeAt(end - 1) === WHITESPACE_CHAR_CODE) {
        end--;
    }
    return string.substring(0, end);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Puts `prefix` in front of the first non-whitespace character of `string`.
 *
 * If `string` starts with whitespace (char code WHITESPACE_CHAR_CODE), all
 * leading whitespace is collapsed into one single space which is kept in
 * front of the prefix, so the prefix directly touches the text it marks.
 * Otherwise the prefix is simply prepended.
 *
 * @param {string} prefix - the symbol to insert
 * @param {string} string - the text to be prefixed
 * @returns {string} the prefixed text
 */
export function prefixAfterWhitespace(prefix, string) {
    // Strict comparison: charCodeAt yields a number (NaN for an empty string).
    if (string.charCodeAt(0) !== WHITESPACE_CHAR_CODE) {
        return prefix + string;
    }
    return ' ' + prefix + removeLeadingWhitespaces(string);
}
|
||||||
|
|
||||||
|
/**
 * Appends `suffix` directly behind the last non-whitespace character of
 * `string`.
 *
 * If `string` ends with whitespace (char code WHITESPACE_CHAR_CODE), all
 * trailing whitespace is collapsed into one single space which is placed
 * behind the suffix, so the suffix directly touches the text it marks.
 * Otherwise the suffix is simply appended.
 *
 * @param {string} string - the text to be suffixed
 * @param {string} suffix - the symbol to insert
 * @returns {string} the suffixed text
 */
export function suffixBeforeWhitespace(string, suffix) {
    // Strict comparison: charCodeAt yields a number (NaN for an empty string).
    if (string.charCodeAt(string.length - 1) !== WHITESPACE_CHAR_CODE) {
        return string + suffix;
    }
    return removeTrailingWhitespaces(string) + suffix + ' ';
}
|
||||||
|
|
||||||
export function isListItem(string) {
|
export function isListItem(string) {
|
||||||
return /^[\s]*[-•][\s].*$/g.test(string);
|
return /^[\s]*[-•][\s].*$/g.test(string);
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@ import TextItem from './TextItem.jsx';
|
|||||||
import { ParsedElements } from './PageItem.jsx';
|
import { ParsedElements } from './PageItem.jsx';
|
||||||
import { isNumber } from '../functions.jsx'
|
import { isNumber } from '../functions.jsx'
|
||||||
import { sortByX } from '../textItemFunctions.jsx'
|
import { sortByX } from '../textItemFunctions.jsx'
|
||||||
|
import { prefixAfterWhitespace, suffixBeforeWhitespace } from '../functions.jsx';
|
||||||
|
|
||||||
// Compact text items which have been grouped to a line (through TextItemLineCompactor) to a single TextItem doing inline transformations like
|
// Compact text items which have been grouped to a line (through TextItemLineCompactor) to a single TextItem doing inline transformations like
|
||||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||||
@ -71,19 +72,19 @@ export default class TextItemLineCompactor {
|
|||||||
const addStartSymbol = () => {
|
const addStartSymbol = () => {
|
||||||
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
|
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
|
||||||
...openFormatItem,
|
...openFormatItem,
|
||||||
text: openFormatType.startSymbol + openFormatItem.text
|
text: prefixAfterWhitespace(openFormatType.startSymbol, openFormatItem.text)
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
const addEndSymbol = (index) => {
|
const addEndSymbol = (index) => {
|
||||||
resolvedLineItems.splice(index, 1, new TextItem({
|
resolvedLineItems.splice(index, 1, new TextItem({
|
||||||
...lastItem,
|
...lastItem,
|
||||||
text: lastItem.text + openFormatType.endSymbol
|
text: suffixBeforeWhitespace(lastItem.text, openFormatType.endSymbol)
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
const addCompleteSymbol = () => {
|
const addCompleteSymbol = () => {
|
||||||
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
|
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
|
||||||
...openFormatItem,
|
...openFormatItem,
|
||||||
text: openFormatType.startSymbol + openFormatItem.text + openFormatType.endSymbol
|
text: suffixBeforeWhitespace(prefixAfterWhitespace(openFormatType.startSymbol, openFormatItem.text), openFormatType.endSymbol)
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import { expect } from 'chai';
|
import { expect } from 'chai';
|
||||||
|
|
||||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
||||||
|
|
||||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||||
|
|
||||||
@ -54,6 +54,43 @@ describe('removeLeadingWhitespaces', () => {
|
|||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('removeTrailingWhitespaces', () => {
|
||||||
|
it('No Removes', () => {
|
||||||
|
expect(removeTrailingWhitespaces(".")).to.be.equal(".");
|
||||||
|
expect(removeTrailingWhitespaces(" .")).to.be.equal(" .");
|
||||||
|
expect(removeTrailingWhitespaces(" . .")).to.be.equal(" . .");
|
||||||
|
});
|
||||||
|
|
||||||
|
it('Removes', () => {
|
||||||
|
expect(removeTrailingWhitespaces(". ")).to.be.equal(".");
|
||||||
|
expect(removeTrailingWhitespaces(". ")).to.be.equal(".");
|
||||||
|
expect(removeTrailingWhitespaces(" . ")).to.be.equal(" .");
|
||||||
|
expect(removeTrailingWhitespaces(" . . ")).to.be.equal(" . .");
|
||||||
|
});
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
describe('prefixAfterWhitespace', () => {
|
||||||
|
it('Basic', () => {
|
||||||
|
expect(prefixAfterWhitespace('1', '2')).to.be.equal('12');
|
||||||
|
expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12');
|
||||||
|
expect(prefixAfterWhitespace(' 1', ' 2')).to.be.equal(' 12');
|
||||||
|
expect(prefixAfterWhitespace('1', ' 2')).to.be.equal(' 12');
|
||||||
|
expect(prefixAfterWhitespace('1', ' 2')).to.be.equal(' 12');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('suffixBeforeWhitespace', () => {
|
||||||
|
it('Basic', () => {
|
||||||
|
expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
|
||||||
|
expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.');
|
||||||
|
expect(suffixBeforeWhitespace(' A ', ' .')).to.be.equal(' A . ');
|
||||||
|
expect(suffixBeforeWhitespace('A', ' .')).to.be.equal('A .');
|
||||||
|
expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
describe('charCodeArray', () => {
|
describe('charCodeArray', () => {
|
||||||
it('Charcodes', () => {
|
it('Charcodes', () => {
|
||||||
|
Loading…
Reference in New Issue
Block a user