mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-23 08:13:30 +01:00
Headline to upper case transformation
* Add testing capability (mocha, chai) * Add MarkdownElement to text item
This commit is contained in:
parent
0245ea16f1
commit
1b326a9f36
@ -18,11 +18,13 @@
|
||||
"env": {
|
||||
"browser": true,
|
||||
"node": true,
|
||||
"es6": true
|
||||
"es6": true,
|
||||
"jasmine": true
|
||||
},
|
||||
// Enable custom plugin known as eslint-plugin-react
|
||||
"plugins": [
|
||||
"react"
|
||||
"react",
|
||||
"jasmine"
|
||||
],
|
||||
"rules": {
|
||||
// Disable `no-console` rule
|
||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
node_modules/
|
||||
build/
|
||||
npm-debug.log
|
||||
.eslintcache
|
@ -7,6 +7,7 @@
|
||||
"watch": "webpack -d --watch",
|
||||
"build": "webpack",
|
||||
"lint": "eslint src --ext .js --ext .jsx --cache",
|
||||
"test": "mocha --compilers js:babel-core/register test/*.spec.js",
|
||||
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
|
||||
"deploy": "npm run release && cp -r build/* docs/"
|
||||
},
|
||||
@ -40,14 +41,17 @@
|
||||
"babel-preset-es2015": "^6.18.0",
|
||||
"babel-preset-react": "^6.16.0",
|
||||
"babel-preset-stage-0": "^6.16.0",
|
||||
"chai": "^3.5.0",
|
||||
"copy-webpack-plugin": "^4.0.1",
|
||||
"css-loader": "^0.25.0",
|
||||
"esformatter-jsx": "^7.0.1",
|
||||
"eslint": "^3.7.0",
|
||||
"eslint-plugin-jasmine": "^2.2.0",
|
||||
"eslint-plugin-react": "^6.3.0",
|
||||
"extract-text-webpack-plugin": "^1.0.1",
|
||||
"file-loader": "^0.9.0",
|
||||
"html-webpack-plugin": "^2.24.1",
|
||||
"mocha": "^3.2.0",
|
||||
"style-loader": "^0.13.1",
|
||||
"url-loader": "^0.5.7",
|
||||
"webpack": "^1.13.3"
|
||||
|
@ -11,3 +11,19 @@ export function isNumber(string) {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Tells whether any word in the given text contains an upper case letter
 * at a position other than the first character of the word.
 *
 * Words are separated by whitespace (spaces, tabs, line breaks, ...).
 * Characters without a case (digits, punctuation) never count as upper case,
 * but they do end the "beginning of word" state, so "5E" and "5'Sec" match
 * while "High5" and "Type-0-mat" do not.
 *
 * @param {string} text - the text to inspect
 * @returns {boolean} true if an upper case letter was found inside a word
 */
export function hasUpperCaseCharacterInMiddleOfWord(text) {
    let beginningOfWord = true;
    // Iterate by code point so letters outside the BMP are not split into surrogates.
    for (const character of text) {
        if (/\s/.test(character)) {
            // Any whitespace (not only ' ') starts a new word.
            beginningOfWord = true;
            continue;
        }
        // A cased letter differs between its upper and lower form; digits and
        // punctuation do not, so no separate numeric check is needed.
        const isUpperCaseLetter = character === character.toUpperCase()
            && character !== character.toLowerCase();
        if (!beginningOfWord && isUpperCaseLetter) {
            return true;
        }
        beginningOfWord = false;
    }
    return false;
}
|
||||
|
@ -6,6 +6,7 @@ import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
import ToTextPages from './transformations/ToTextPages.jsx';
|
||||
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
|
||||
|
||||
@ -24,6 +25,7 @@ export default class AppState {
|
||||
new DetectFootnotes(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new HeadlineDetector(),
|
||||
new HeadlineToUppercase(),
|
||||
new ToTextPages(),
|
||||
new ToSingleTextPage()];
|
||||
|
||||
|
@ -8,6 +8,7 @@ export default class TextItem {
|
||||
this.height = options.height;
|
||||
this.text = options.text;
|
||||
this.annotation = options.annotation;
|
||||
this.markdownElement = options.markdownElement;
|
||||
}
|
||||
|
||||
}
|
||||
|
17
src/javascript/models/markdown/Headline.jsx
Normal file
17
src/javascript/models/markdown/Headline.jsx
Normal file
@ -0,0 +1,17 @@
|
||||
import MarkdownElement from './MarkdownElement.jsx';
|
||||
|
||||
/**
 * Markdown headline element. Requests a new line before and after itself
 * and renders its text prefixed with one '#' per headline level.
 */
export default class Headline extends MarkdownElement {

    /**
     * @param {Object} options
     * @param {number} options.level - headline level (1 renders '#', 2 renders '##', ...)
     */
    constructor(options) {
        const lineBreaks = {
            newLineBefore: true,
            newLineAfter: true
        };
        super(lineBreaks);
        const { level } = options;
        this.level = level;
    }

    /**
     * @param {string} text - the raw headline text
     * @returns {string} the text prefixed with `level` hash characters and a space
     */
    transformText(text) {
        const prefix = '#'.repeat(this.level);
        return `${prefix} ${text}`;
    }

}
|
16
src/javascript/models/markdown/MarkdownElement.jsx
Normal file
16
src/javascript/models/markdown/MarkdownElement.jsx
Normal file
@ -0,0 +1,16 @@
|
||||
/**
 * A text item detected as a markdown element.
 *
 * Abstract base class: it cannot be instantiated directly, and subclasses
 * must override transformText().
 */
export default class MarkdownElement {

    /**
     * @param {Object} [options]
     * @param {boolean} [options.newLineBefore] - stored as-is on the instance
     * @param {boolean} [options.newLineAfter] - stored as-is on the instance
     * @throws {TypeError} when MarkdownElement itself (not a subclass) is constructed
     */
    constructor(options = {}) {
        if (this.constructor === MarkdownElement) {
            throw new TypeError("Can not construct abstract class.");
        }
        this.newLineBefore = options.newLineBefore;
        this.newLineAfter = options.newLineAfter;
    }

    /**
     * Converts the raw text of an item into its markdown representation.
     * Must be overridden by subclasses.
     *
     * @param {string} text - the raw text of the item
     * @throws {TypeError} always, in this base implementation
     */
    transformText(text) { // eslint-disable-line no-unused-vars
        throw new TypeError("Do not call abstract method transformText from child.");
    }

}
|
@ -4,6 +4,8 @@ import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
|
||||
import Headline from '../markdown/Headline.jsx';
|
||||
|
||||
|
||||
function analyzeHeigths(pages) {
|
||||
const analyzationResult = {
|
||||
@ -47,13 +49,13 @@ function analyzeHeigths(pages) {
|
||||
return analyzationResult;
|
||||
}
|
||||
|
||||
function findNextMajorHeight(heights, currentHeight, headlineMap) {
|
||||
function findNextMajorHeight(heights, currentHeight, headlineLevels) {
|
||||
for (var i = currentHeight; i < heights.length; i++) {
|
||||
if (headlineMap[heights[i]]) {
|
||||
if (headlineLevels[heights[i]]) {
|
||||
return heights[i];
|
||||
}
|
||||
}
|
||||
throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineMap=${headlineMap}`;
|
||||
throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineLevels=${headlineLevels}`;
|
||||
}
|
||||
|
||||
|
||||
@ -95,26 +97,26 @@ export default class HeadlineDetector extends Transformation {
|
||||
});
|
||||
|
||||
|
||||
const headlineMap = {};
|
||||
headlineHeights.reverse().forEach((height, i) => headlineMap[height] = '#'.repeat(i + 1));
|
||||
const headlineLevels = {};
|
||||
headlineHeights.reverse().forEach((height, i) => headlineLevels[height] = i + 1);
|
||||
var lastMajorHeight = paragraphHeight;
|
||||
var heights = heightAnalyzation.heights;
|
||||
for (var i = 0; i < heights.length; i++) {
|
||||
if (heights[i] > paragraphHeight && !headlineMap[heights[i]]) {
|
||||
const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineMap);
|
||||
if (heights[i] > paragraphHeight && !headlineLevels[heights[i]]) {
|
||||
const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineLevels);
|
||||
const distanceToLower = heights[i] - lastMajorHeight;
|
||||
const distanceToHigher = nextMajorHeight - heights[i];
|
||||
if (distanceToLower <= distanceToHigher) {
|
||||
if (lastMajorHeight == paragraphHeight) {
|
||||
paragraphHeight++;
|
||||
} else {
|
||||
headlineMap[heights[i]] = headlineMap[lastMajorHeight];
|
||||
headlineLevels[heights[i]] = headlineLevels[lastMajorHeight];
|
||||
}
|
||||
} else {
|
||||
headlineMap[heights[i]] = headlineMap[nextMajorHeight];
|
||||
headlineLevels[heights[i]] = headlineLevels[nextMajorHeight];
|
||||
}
|
||||
}
|
||||
if (headlineMap[heights[i]]) {
|
||||
if (headlineLevels[heights[i]]) {
|
||||
lastMajorHeight = heights[i];
|
||||
}
|
||||
}
|
||||
@ -125,12 +127,16 @@ export default class HeadlineDetector extends Transformation {
|
||||
if (item.height <= paragraphHeight) {
|
||||
newTextItems.push(item);
|
||||
} else {
|
||||
const headlineLevel = headlineLevels[item.height];
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: item.text,
|
||||
annotation: new Annotation({
|
||||
category: headlineMap[item.height],
|
||||
category: "Headline " + headlineLevel,
|
||||
color: 'green'
|
||||
}),
|
||||
markdownElement: new Headline({
|
||||
level: headlineLevel
|
||||
})
|
||||
}));
|
||||
}
|
||||
@ -144,11 +150,7 @@ export default class HeadlineDetector extends Transformation {
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems.forEach(item => {
|
||||
if (item.annotation) {
|
||||
item.text = item.annotation.category + ' ' + item.text;
|
||||
}
|
||||
});
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
@ -0,0 +1,69 @@
|
||||
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
import Headline from '../markdown/Headline.jsx';

import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
|
||||
|
||||
// Uppercase headlines are often parsed with very mixed character with pdf.js, like 'A heAdLine'.
|
||||
// This tries to detect them and make them all uppercase.
|
||||
export default class HeadlineToUppercase extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Headlines Uppercase");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
|
||||
|
||||
return pages.map(page => {
|
||||
const newTextItems = [];
|
||||
page.textItems.forEach(item => {
|
||||
if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') {
|
||||
const headline = item.text.trim();
|
||||
if (hasUpperCaseCharacterInMiddleOfWord(headline)) {
|
||||
item.annotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
newTextItems.push(item);
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: item.text.toUpperCase(),
|
||||
annotation: new Annotation({
|
||||
category: "Uppercased",
|
||||
color: 'green'
|
||||
})
|
||||
}));
|
||||
} else {
|
||||
item.annotation = new Annotation({
|
||||
category: 'Untouched',
|
||||
color: 'brown'
|
||||
});
|
||||
newTextItems.push(item);
|
||||
}
|
||||
} else {
|
||||
newTextItems.push(item);
|
||||
}
|
||||
});
|
||||
return {
|
||||
...page,
|
||||
textItems: newTextItems
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
}
|
@ -3,6 +3,13 @@ import PdfPage from '../PdfPage.jsx';
|
||||
import TextPage from '../TextPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
/**
 * Tells whether two text items carry markdown elements of the same class.
 *
 * Compares the constructors themselves rather than their names: names are
 * not reliable identifiers (minifiers rename classes, and two different
 * classes may end up with the same name).
 *
 * @param {Object} item1 - item with an optional `markdownElement` property
 * @param {Object} item2 - item with an optional `markdownElement` property
 * @returns {boolean} false if either item has no markdown element,
 *                    otherwise whether both elements share the same constructor
 */
function itemIsSameMarkdownElement(item1, item2) {
    if (!item1.markdownElement || !item2.markdownElement) {
        return false;
    }
    return item1.markdownElement.constructor === item2.markdownElement.constructor;
}
|
||||
|
||||
export default class ToTextPages extends Transformation {
|
||||
|
||||
constructor() {
|
||||
@ -14,11 +21,23 @@ export default class ToTextPages extends Transformation {
|
||||
}
|
||||
|
||||
transform(pdfPages:PdfPage[]) {
|
||||
return pdfPages.map(pdfPage => {
|
||||
return pdfPages.map(page => {
|
||||
var text = '';
|
||||
pdfPage.textItems.forEach(textItem => text += textItem.text + '\n');
|
||||
page.textItems.forEach((textItem, i) => {
|
||||
if (textItem.markdownElement) {
|
||||
if (i > 0 && textItem.markdownElement.newLineBefore && !itemIsSameMarkdownElement(textItem, page.textItems[i - 1])) {
|
||||
text += '\n'
|
||||
}
|
||||
text += textItem.markdownElement.transformText(textItem.text) + '\n'
|
||||
if (textItem.markdownElement.newLineAfter && (i == page.textItems.length - 1 || !itemIsSameMarkdownElement(textItem, page.textItems[i + 1]))) {
|
||||
text += '\n'
|
||||
}
|
||||
} else {
|
||||
text += textItem.text + '\n'
|
||||
}
|
||||
});
|
||||
return new TextPage({
|
||||
index: pdfPage.index,
|
||||
index: page.index,
|
||||
text: text
|
||||
});
|
||||
});
|
||||
|
27
test/Headline.spec.js
Normal file
27
test/Headline.spec.js
Normal file
@ -0,0 +1,27 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import Headline from '../src/javascript/models/markdown/Headline';
|
||||
|
||||
describe('Headline', () => {
|
||||
|
||||
it('correct level 1 props', () => {
|
||||
const headline = new Headline({
|
||||
level: 1
|
||||
});
|
||||
expect(headline.level).to.equal(1);
|
||||
expect(headline.newLineBefore).to.equal(true);
|
||||
expect(headline.newLineAfter).to.equal(true);
|
||||
expect(headline.transformText('Hello World')).to.equal('# Hello World');
|
||||
});
|
||||
|
||||
it('correct level 2 props', () => {
|
||||
const headline = new Headline({
|
||||
level: 2
|
||||
});
|
||||
expect(headline.level).to.equal(2);
|
||||
expect(headline.newLineBefore).to.equal(true);
|
||||
expect(headline.newLineAfter).to.equal(true);
|
||||
expect(headline.transformText('Hello World')).to.equal('## Hello World');
|
||||
});
|
||||
|
||||
});
|
39
test/functions.spec.js
Normal file
39
test/functions.spec.js
Normal file
@ -0,0 +1,39 @@
|
||||
import { expect } from 'chai';
|
||||
|
||||
import { hasUpperCaseCharacterInMiddleOfWord } from '../src/javascript/functions.jsx'
|
||||
|
||||
describe('hasUpperCaseCharacterInMiddleOfWord', () => {
|
||||
|
||||
it('single word', () => {
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false);
|
||||
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("wOrd")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("woRd")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("worD")).to.equal(true);
|
||||
});
|
||||
|
||||
it('multi words', () => {
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("Hello World")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("hello world")).to.equal(false);
|
||||
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("HelloWorld")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("HellO World")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("Hello WOrld")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("Hello WorlD")).to.equal(true);
|
||||
});
|
||||
|
||||
it('with numbers', () => {
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("high5")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("High5")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("High 5")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("High 5th")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("High 5'sec")).to.equal(false);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("Type-0-mat")).to.equal(false);
|
||||
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("HigH 5")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("High 5E")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("High 5 or tWo down")).to.equal(true);
|
||||
expect(hasUpperCaseCharacterInMiddleOfWord("High 5'Sec")).to.equal(true);
|
||||
});
|
||||
});
|
@ -15,10 +15,9 @@ module.exports = {
|
||||
filename: 'bundle.js'
|
||||
},
|
||||
resolve: {
|
||||
extensions: ['', '.js', '.vue'],
|
||||
extensions: ['', '.js'],
|
||||
fallback: [path.join(__dirname, '../node_modules')],
|
||||
alias: {
|
||||
'vue$': 'vue/dist/vue',
|
||||
'src': path.resolve(__dirname, '../src'),
|
||||
'assets': path.resolve(__dirname, '../src/assets'),
|
||||
'components': path.resolve(__dirname, '../src/components')
|
||||
|
Loading…
Reference in New Issue
Block a user