Headline to upper case transformation

* Add testing capability (mocha, chai)
* Add MarkdownElement to text item
This commit is contained in:
Johannes Zillmann 2017-02-05 21:22:42 +01:00
parent 0245ea16f1
commit 1b326a9f36
14 changed files with 238 additions and 24 deletions

View File

@ -18,11 +18,13 @@
"env": { "env": {
"browser": true, "browser": true,
"node": true, "node": true,
"es6": true "es6": true,
"jasmine": true
}, },
// Enable custom plugin known as eslint-plugin-react // Enable custom plugin known as eslint-plugin-react
"plugins": [ "plugins": [
"react" "react",
"jasmine"
], ],
"rules": { "rules": {
// Disable `no-console` rule // Disable `no-console` rule

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
node_modules/ node_modules/
build/ build/
npm-debug.log npm-debug.log
.eslintcache

View File

@ -7,6 +7,7 @@
"watch": "webpack -d --watch", "watch": "webpack -d --watch",
"build": "webpack", "build": "webpack",
"lint": "eslint src --ext .js --ext .jsx --cache", "lint": "eslint src --ext .js --ext .jsx --cache",
"test": "mocha --compilers js:babel-core/register test/*.spec.js",
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p", "release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
"deploy": "npm run release && cp -r build/* docs/" "deploy": "npm run release && cp -r build/* docs/"
}, },
@ -40,14 +41,17 @@
"babel-preset-es2015": "^6.18.0", "babel-preset-es2015": "^6.18.0",
"babel-preset-react": "^6.16.0", "babel-preset-react": "^6.16.0",
"babel-preset-stage-0": "^6.16.0", "babel-preset-stage-0": "^6.16.0",
"chai": "^3.5.0",
"copy-webpack-plugin": "^4.0.1", "copy-webpack-plugin": "^4.0.1",
"css-loader": "^0.25.0", "css-loader": "^0.25.0",
"esformatter-jsx": "^7.0.1", "esformatter-jsx": "^7.0.1",
"eslint": "^3.7.0", "eslint": "^3.7.0",
"eslint-plugin-jasmine": "^2.2.0",
"eslint-plugin-react": "^6.3.0", "eslint-plugin-react": "^6.3.0",
"extract-text-webpack-plugin": "^1.0.1", "extract-text-webpack-plugin": "^1.0.1",
"file-loader": "^0.9.0", "file-loader": "^0.9.0",
"html-webpack-plugin": "^2.24.1", "html-webpack-plugin": "^2.24.1",
"mocha": "^3.2.0",
"style-loader": "^0.13.1", "style-loader": "^0.13.1",
"url-loader": "^0.5.7", "url-loader": "^0.5.7",
"webpack": "^1.13.3" "webpack": "^1.13.3"

View File

@ -11,3 +11,19 @@ export function isNumber(string) {
} }
return true; return true;
} }
/**
 * Reports whether any word in `text` contains an uppercase letter after its first character.
 *
 * Words are separated by whitespace (space, tab, newline, non-breaking space, ...).
 * Digits and punctuation are never uppercase themselves, but they do count as word
 * characters: in "5E" the 'E' is in the middle of a word.
 *
 * @param {string} text - The text to inspect.
 * @returns {boolean} `true` if an uppercase letter occurs at a non-leading position of a word.
 */
export function hasUpperCaseCharacterInMiddleOfWord(text) {
    let beginningOfWord = true;
    for (let i = 0; i < text.length; i++) {
        const character = text.charAt(i);
        // Any whitespace starts a new word, not just the plain ASCII space.
        if (/\s/.test(character)) {
            beginningOfWord = true;
            continue;
        }
        const upperCased = character.toUpperCase();
        // A cased uppercase letter equals its upper-case form and differs from its lower-case
        // form. Digits and punctuation have identical forms, so they never match here.
        if (!beginningOfWord && character === upperCased && upperCased !== character.toLowerCase()) {
            return true;
        }
        beginningOfWord = false;
    }
    return false;
}

View File

@ -6,6 +6,7 @@ import CombineSameY from './transformations/CombineSameY.jsx';
import DetectFootnotes from './transformations/DetectFootnotes.jsx' import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import HeadlineDetector from './transformations/HeadlineDetector.jsx' import HeadlineDetector from './transformations/HeadlineDetector.jsx'
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextPages from './transformations/ToTextPages.jsx'; import ToTextPages from './transformations/ToTextPages.jsx';
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx' import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
@ -24,6 +25,7 @@ export default class AppState {
new DetectFootnotes(), new DetectFootnotes(),
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new HeadlineDetector(), new HeadlineDetector(),
new HeadlineToUppercase(),
new ToTextPages(), new ToTextPages(),
new ToSingleTextPage()]; new ToSingleTextPage()];

View File

@ -8,6 +8,7 @@ export default class TextItem {
this.height = options.height; this.height = options.height;
this.text = options.text; this.text = options.text;
this.annotation = options.annotation; this.annotation = options.annotation;
this.markdownElement = options.markdownElement;
} }
} }

View File

@ -0,0 +1,17 @@
import MarkdownElement from './MarkdownElement.jsx';
// Markdown headline element: always separated from its neighbours by blank lines
// and rendered with one leading '#' per headline level.
export default class Headline extends MarkdownElement {

    // options.level: the headline level, i.e. the number of '#' characters to emit.
    constructor(options) {
        const spacing = {
            newLineBefore: true,
            newLineAfter: true
        };
        super(spacing);
        this.level = options.level;
    }

    // Prefixes the given text with the '#' marker for this headline's level.
    transformText(text) {
        const marker = '#'.repeat(this.level);
        return marker + ' ' + text;
    }
}

View File

@ -0,0 +1,16 @@
// A text item detected as a markdown element.
// Abstract base class: it cannot be instantiated directly, and subclasses must
// override transformText().
export default class MarkdownElement {

    // options.newLineBefore / options.newLineAfter: whether a blank line should be
    // emitted before / after the element when it is rendered as text.
    constructor(options) {
        if (this.constructor === MarkdownElement) {
            throw new TypeError("Can not construct abstract class.");
        }
        this.newLineBefore = options.newLineBefore;
        this.newLineAfter = options.newLineAfter;
    }

    // Turns the raw item text into its markdown representation.
    // Abstract: the base implementation always throws.
    transformText(text) { // eslint-disable-line no-unused-vars
        throw new TypeError("Abstract method transformText must be implemented by the subclass.");
    }
}

View File

@ -4,6 +4,8 @@ import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx'; import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx'; import Annotation from '../Annotation.jsx';
import Headline from '../markdown/Headline.jsx';
function analyzeHeigths(pages) { function analyzeHeigths(pages) {
const analyzationResult = { const analyzationResult = {
@ -47,13 +49,13 @@ function analyzeHeigths(pages) {
return analyzationResult; return analyzationResult;
} }
function findNextMajorHeight(heights, currentHeight, headlineMap) { function findNextMajorHeight(heights, currentHeight, headlineLevels) {
for (var i = currentHeight; i < heights.length; i++) { for (var i = currentHeight; i < heights.length; i++) {
if (headlineMap[heights[i]]) { if (headlineLevels[heights[i]]) {
return heights[i]; return heights[i];
} }
} }
throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineMap=${headlineMap}`; throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineLevels=${headlineLevels}`;
} }
@ -95,26 +97,26 @@ export default class HeadlineDetector extends Transformation {
}); });
const headlineMap = {}; const headlineLevels = {};
headlineHeights.reverse().forEach((height, i) => headlineMap[height] = '#'.repeat(i + 1)); headlineHeights.reverse().forEach((height, i) => headlineLevels[height] = i + 1);
var lastMajorHeight = paragraphHeight; var lastMajorHeight = paragraphHeight;
var heights = heightAnalyzation.heights; var heights = heightAnalyzation.heights;
for (var i = 0; i < heights.length; i++) { for (var i = 0; i < heights.length; i++) {
if (heights[i] > paragraphHeight && !headlineMap[heights[i]]) { if (heights[i] > paragraphHeight && !headlineLevels[heights[i]]) {
const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineMap); const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineLevels);
const distanceToLower = heights[i] - lastMajorHeight; const distanceToLower = heights[i] - lastMajorHeight;
const distanceToHigher = nextMajorHeight - heights[i]; const distanceToHigher = nextMajorHeight - heights[i];
if (distanceToLower <= distanceToHigher) { if (distanceToLower <= distanceToHigher) {
if (lastMajorHeight == paragraphHeight) { if (lastMajorHeight == paragraphHeight) {
paragraphHeight++; paragraphHeight++;
} else { } else {
headlineMap[heights[i]] = headlineMap[lastMajorHeight]; headlineLevels[heights[i]] = headlineLevels[lastMajorHeight];
} }
} else { } else {
headlineMap[heights[i]] = headlineMap[nextMajorHeight]; headlineLevels[heights[i]] = headlineLevels[nextMajorHeight];
} }
} }
if (headlineMap[heights[i]]) { if (headlineLevels[heights[i]]) {
lastMajorHeight = heights[i]; lastMajorHeight = heights[i];
} }
} }
@ -125,12 +127,16 @@ export default class HeadlineDetector extends Transformation {
if (item.height <= paragraphHeight) { if (item.height <= paragraphHeight) {
newTextItems.push(item); newTextItems.push(item);
} else { } else {
const headlineLevel = headlineLevels[item.height];
newTextItems.push(new TextItem({ newTextItems.push(new TextItem({
...item, ...item,
text: item.text, text: item.text,
annotation: new Annotation({ annotation: new Annotation({
category: headlineMap[item.height], category: "Headline " + headlineLevel,
color: 'green' color: 'green'
}),
markdownElement: new Headline({
level: headlineLevel
}) })
})); }));
} }
@ -144,11 +150,7 @@ export default class HeadlineDetector extends Transformation {
processAnnotations(pages:PdfPage[]) { processAnnotations(pages:PdfPage[]) {
pages.forEach(page => { pages.forEach(page => {
page.textItems.forEach(item => { page.textItems.forEach(textItem => textItem.annotation = null)
if (item.annotation) {
item.text = item.annotation.category + ' ' + item.text;
}
});
}); });
return pages; return pages;
} }

View File

@ -0,0 +1,69 @@
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
import Headline from '../markdown/Headline.jsx';
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
// Uppercase headlines are often parsed with very mixed character with pdf.js, like 'A heAdLine'.
// This tries to detect them and make them all uppercase.
export default class HeadlineToUppercase extends Transformation {
constructor() {
super("Headlines Uppercase");
}
contentView() {
return ContentView.PDF;
}
transform(pages:PdfPage[]) {
return pages.map(page => {
const newTextItems = [];
page.textItems.forEach(item => {
if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') {
const headline = item.text.trim();
if (hasUpperCaseCharacterInMiddleOfWord(headline)) {
item.annotation = new Annotation({
category: 'removed',
color: 'red'
});
newTextItems.push(item);
newTextItems.push(new TextItem({
...item,
text: item.text.toUpperCase(),
annotation: new Annotation({
category: "Uppercased",
color: 'green'
})
}));
} else {
item.annotation = new Annotation({
category: 'Untouched',
color: 'brown'
});
newTextItems.push(item);
}
} else {
newTextItems.push(item);
}
});
return {
...page,
textItems: newTextItems
};
});
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
}
}

View File

@ -3,6 +3,13 @@ import PdfPage from '../PdfPage.jsx';
import TextPage from '../TextPage.jsx'; import TextPage from '../TextPage.jsx';
import ContentView from '../ContentView.jsx'; import ContentView from '../ContentView.jsx';
// Tells whether both text items carry a markdown element of the same class.
// Returns false as soon as one of the items has no markdown element.
function itemIsSameMarkdownElement(item1, item2) {
    if (!item1.markdownElement || !item2.markdownElement) {
        return false;
    }
    // Compare the constructors themselves, not their names: distinct classes can share
    // a name (for example after minification mangles class names).
    return item1.markdownElement.constructor === item2.markdownElement.constructor;
}
export default class ToTextPages extends Transformation { export default class ToTextPages extends Transformation {
constructor() { constructor() {
@ -14,11 +21,23 @@ export default class ToTextPages extends Transformation {
} }
transform(pdfPages:PdfPage[]) { transform(pdfPages:PdfPage[]) {
return pdfPages.map(pdfPage => { return pdfPages.map(page => {
var text = ''; var text = '';
pdfPage.textItems.forEach(textItem => text += textItem.text + '\n'); page.textItems.forEach((textItem, i) => {
if (textItem.markdownElement) {
if (i > 0 && textItem.markdownElement.newLineBefore && !itemIsSameMarkdownElement(textItem, page.textItems[i - 1])) {
text += '\n'
}
text += textItem.markdownElement.transformText(textItem.text) + '\n'
if (textItem.markdownElement.newLineAfter && (i == page.textItems.length - 1 || !itemIsSameMarkdownElement(textItem, page.textItems[i + 1]))) {
text += '\n'
}
} else {
text += textItem.text + '\n'
}
});
return new TextPage({ return new TextPage({
index: pdfPage.index, index: page.index,
text: text text: text
}); });
}); });

27
test/Headline.spec.js Normal file
View File

@ -0,0 +1,27 @@
import { expect } from 'chai';
import Headline from '../src/javascript/models/markdown/Headline';
// Specification of the Headline markdown element for levels 1 and 2.
describe('Headline', () => {

    // Builds a headline of the given level and checks its level, its spacing flags
    // and the markdown produced for 'Hello World'.
    const expectHeadlineProps = (level, expectedMarkdown) => {
        const headline = new Headline({
            level: level
        });
        expect(headline.level).to.equal(level);
        expect(headline.newLineBefore).to.equal(true);
        expect(headline.newLineAfter).to.equal(true);
        expect(headline.transformText('Hello World')).to.equal(expectedMarkdown);
    };

    it('correct level 1 props', () => {
        expectHeadlineProps(1, '# Hello World');
    });

    it('correct level 2 props', () => {
        expectHeadlineProps(2, '## Hello World');
    });

});

39
test/functions.spec.js Normal file
View File

@ -0,0 +1,39 @@
import { expect } from 'chai';
import { hasUpperCaseCharacterInMiddleOfWord } from '../src/javascript/functions.jsx'
// Specification of hasUpperCaseCharacterInMiddleOfWord, written as tables of
// [input, expected result] pairs.
describe('hasUpperCaseCharacterInMiddleOfWord', () => {

    // Checks every [input, expected] pair, in the order given.
    const expectAll = (cases) => {
        cases.forEach(([input, expected]) => {
            expect(hasUpperCaseCharacterInMiddleOfWord(input)).to.equal(expected);
        });
    };

    it('single word', () => {
        expectAll([
            ["word", false],
            ["Word", false],
            ["wOrd", true],
            ["woRd", true],
            ["worD", true]
        ]);
    });

    it('multi words', () => {
        expectAll([
            ["Hello World", false],
            ["hello world", false],
            ["HelloWorld", true],
            ["HellO World", true],
            ["Hello WOrld", true],
            ["Hello WorlD", true]
        ]);
    });

    it('with numbers', () => {
        expectAll([
            ["high5", false],
            ["High5", false],
            ["High 5", false],
            ["High 5th", false],
            ["High 5'sec", false],
            ["Type-0-mat", false],
            ["HigH 5", true],
            ["High 5E", true],
            ["High 5 or tWo down", true],
            ["High 5'Sec", true]
        ]);
    });

});

View File

@ -15,10 +15,9 @@ module.exports = {
filename: 'bundle.js' filename: 'bundle.js'
}, },
resolve: { resolve: {
extensions: ['', '.js', '.vue'], extensions: ['', '.js'],
fallback: [path.join(__dirname, '../node_modules')], fallback: [path.join(__dirname, '../node_modules')],
alias: { alias: {
'vue$': 'vue/dist/vue',
'src': path.resolve(__dirname, '../src'), 'src': path.resolve(__dirname, '../src'),
'assets': path.resolve(__dirname, '../src/assets'), 'assets': path.resolve(__dirname, '../src/assets'),
'components': path.resolve(__dirname, '../src/components') 'components': path.resolve(__dirname, '../src/components')