mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-01 03:09:01 +01:00
[WIP] stabilized formatting
This commit is contained in:
parent
10cc7cf0ab
commit
e144d6a6d5
@ -55,6 +55,32 @@ export function removeLeadingWhitespaces(string) {
|
||||
return string;
|
||||
}
|
||||
|
||||
/**
 * Removes every trailing whitespace character from a string.
 *
 * Only characters whose char code equals WHITESPACE_CHAR_CODE are treated as
 * whitespace; any other character ends the scan.
 *
 * @param {string} string - the text to trim on its right side
 * @returns {string} the text without trailing WHITESPACE_CHAR_CODE characters
 */
export function removeTrailingWhitespaces(string) {
    // Walk back from the end once and cut a single time, instead of
    // re-slicing the string for every trailing whitespace character.
    let end = string.length;
    while (end > 0 && string.charCodeAt(end - 1) === WHITESPACE_CHAR_CODE) {
        end--;
    }
    return string.substring(0, end);
}
|
||||
|
||||
|
||||
/**
 * Puts a prefix in front of the first non-whitespace character of a string.
 *
 * When the string starts with a whitespace character (WHITESPACE_CHAR_CODE),
 * the leading whitespace is removed via removeLeadingWhitespaces() and a
 * single space is placed in front of the prefix instead. Otherwise the prefix
 * is simply prepended.
 *
 * @param {string} prefix - the text to insert
 * @param {string} string - the text that receives the prefix
 * @returns {string} the prefixed text
 */
export function prefixAfterWhitespace(prefix, string) {
    // Strict comparison: charCodeAt() always yields a number (NaN for an empty string).
    if (string.charCodeAt(0) !== WHITESPACE_CHAR_CODE) {
        return prefix + string;
    }
    return ' ' + prefix + removeLeadingWhitespaces(string);
}
|
||||
|
||||
/**
 * Appends a suffix directly after the last non-whitespace character of a string.
 *
 * When the string ends with a whitespace character (WHITESPACE_CHAR_CODE),
 * the trailing whitespace is removed via removeTrailingWhitespaces() and a
 * single space is placed after the suffix instead. Otherwise the suffix is
 * simply appended.
 *
 * @param {string} string - the text that receives the suffix
 * @param {string} suffix - the text to insert
 * @returns {string} the suffixed text
 */
export function suffixBeforeWhitespace(string, suffix) {
    // Strict comparison: charCodeAt() always yields a number (NaN for an empty string).
    if (string.charCodeAt(string.length - 1) !== WHITESPACE_CHAR_CODE) {
        return string + suffix;
    }
    return removeTrailingWhitespaces(string) + suffix + ' ';
}
|
||||
|
||||
/**
 * Tells whether a string looks like a bullet list item.
 *
 * A match requires optional leading whitespace, then '-' or '•', then one
 * whitespace character; the rest of the string must not contain a line
 * terminator before its end.
 *
 * @param {string} string - the text to inspect
 * @returns {boolean} true if the text matches the bullet pattern
 */
export function isListItem(string) {
    // No `g` flag: a global regex keeps `lastIndex` state between test() calls,
    // which is pointless for a one-shot anchored check and a trap if the
    // pattern is ever hoisted into a shared constant.
    return /^[\s]*[-•][\s].*$/.test(string);
}
|
||||
|
@ -9,17 +9,17 @@ StringFormat.initEnum({
|
||||
},
|
||||
BOLD: {
|
||||
needFormat: true,
|
||||
startSymbol: '**',
|
||||
endSymbol: '**'
|
||||
startSymbol: ' **',
|
||||
endSymbol: '** '
|
||||
},
|
||||
OBLIQUE: {
|
||||
needFormat: true,
|
||||
startSymbol: '_',
|
||||
endSymbol: '_'
|
||||
startSymbol: ' _',
|
||||
endSymbol: '_ '
|
||||
},
|
||||
BOLD_OBLIQUE: {
|
||||
needFormat: true,
|
||||
startSymbol: '**_',
|
||||
endSymbol: '_**'
|
||||
startSymbol: ' **_',
|
||||
endSymbol: '_** '
|
||||
}
|
||||
})
|
@ -2,6 +2,7 @@ import TextItem from './TextItem.jsx';
|
||||
import { ParsedElements } from './PageItem.jsx';
|
||||
import { isNumber } from '../functions.jsx'
|
||||
import { sortByX } from '../textItemFunctions.jsx'
|
||||
import { prefixAfterWhitespace, suffixBeforeWhitespace } from '../functions.jsx';
|
||||
|
||||
// Compacts text items which have been grouped to a line (through TextItemLineCompactor) into a single TextItem, applying inline transformations like
// 'whitespace removal', bold/emphasis annotation, link detection, etc.
|
||||
@ -71,19 +72,19 @@ export default class TextItemLineCompactor {
|
||||
// Replaces the entry at openFormatIndex with a copy of openFormatItem whose
// text carries the start symbol of openFormatType. prefixAfterWhitespace()
// places the symbol after any leading whitespace of the text rather than in
// front of it.
const addStartSymbol = () => {
    const textWithStartSymbol = prefixAfterWhitespace(openFormatType.startSymbol, openFormatItem.text);
    resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
        ...openFormatItem,
        text: textWithStartSymbol
    }));
};
|
||||
// Replaces the entry at the given index with a copy of lastItem whose text
// carries the end symbol of openFormatType. suffixBeforeWhitespace() places
// the symbol before any trailing whitespace of the text rather than after it.
const addEndSymbol = (index) => {
    const textWithEndSymbol = suffixBeforeWhitespace(lastItem.text, openFormatType.endSymbol);
    resolvedLineItems.splice(index, 1, new TextItem({
        ...lastItem,
        text: textWithEndSymbol
    }));
};
|
||||
// Replaces the entry at openFormatIndex with a copy of openFormatItem whose
// text is wrapped in both the start and the end symbol of openFormatType.
// The symbols are placed inside any leading/trailing whitespace of the text
// by prefixAfterWhitespace() and suffixBeforeWhitespace().
const addCompleteSymbol = () => {
    const textWithStartSymbol = prefixAfterWhitespace(openFormatType.startSymbol, openFormatItem.text);
    const wrappedText = suffixBeforeWhitespace(textWithStartSymbol, openFormatType.endSymbol);
    resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
        ...openFormatItem,
        text: wrappedText
    }));
};
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
||||
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
|
||||
|
||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
|
||||
@ -54,6 +54,43 @@ describe('removeLeadingWhitespaces', () => {
|
||||
|
||||
});
|
||||
|
||||
// Checks that removeTrailingWhitespaces() strips spaces at the end of a
// string and leaves leading and inner spaces alone.
describe('removeTrailingWhitespaces', () => {

    it('No Removes', () => {
        expect(removeTrailingWhitespaces(".")).to.be.equal(".");
        expect(removeTrailingWhitespaces(" .")).to.be.equal(" .");
        expect(removeTrailingWhitespaces(" . .")).to.be.equal(" . .");
    });

    it('Removes', () => {
        expect(removeTrailingWhitespaces(". ")).to.be.equal(".");
        // Several trailing spaces must all be removed, not just the last one.
        expect(removeTrailingWhitespaces(".   ")).to.be.equal(".");
        expect(removeTrailingWhitespaces(" . ")).to.be.equal(" .");
        expect(removeTrailingWhitespaces(" . . ")).to.be.equal(" . .");
    });

});
|
||||
|
||||
|
||||
// Checks where prefixAfterWhitespace() places the prefix relative to leading
// whitespace of the target string.
describe('prefixAfterWhitespace', () => {
    it('Basic', () => {
        expect(prefixAfterWhitespace('1', '2')).to.be.equal('12');
        expect(prefixAfterWhitespace(' 1', '2')).to.be.equal(' 12');
        // The re-added single space plus the prefix's own leading space.
        expect(prefixAfterWhitespace(' 1', ' 2')).to.be.equal('  12');
        expect(prefixAfterWhitespace('1', ' 2')).to.be.equal(' 12');
        // Several leading spaces collapse into the single re-added one.
        expect(prefixAfterWhitespace('1', '   2')).to.be.equal(' 12');
    });
});
|
||||
|
||||
// Checks where suffixBeforeWhitespace() places the suffix relative to
// trailing whitespace of the target string.
describe('suffixBeforeWhitespace', () => {
    it('Basic', () => {
        expect(suffixBeforeWhitespace('A ', '.')).to.be.equal('A. ');
        expect(suffixBeforeWhitespace(' A', '.')).to.be.equal(' A.');
        expect(suffixBeforeWhitespace(' A ', ' .')).to.be.equal(' A . ');
        expect(suffixBeforeWhitespace('A', ' .')).to.be.equal('A .');
        // Several trailing spaces collapse into the single re-added one.
        expect(suffixBeforeWhitespace('A   ', '.')).to.be.equal('A. ');
    });
});
|
||||
|
||||
|
||||
describe('charCodeArray', () => {
|
||||
it('Charcodes', () => {
|
||||
|
Loading…
Reference in New Issue
Block a user