From 1b326a9f36c3f3bf295dc5d8de50739ca90b3b7c Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Sun, 5 Feb 2017 21:22:42 +0100 Subject: [PATCH] Headline to upper case transformation * Add testing capability (mocha, chai) * Add MarkdownElement to text item --- .eslintrc | 6 +- .gitignore | 1 + package.json | 4 ++ src/javascript/functions.jsx | 18 ++++- src/javascript/models/AppState.jsx | 2 + src/javascript/models/TextItem.jsx | 1 + src/javascript/models/markdown/Headline.jsx | 17 +++++ .../models/markdown/MarkdownElement.jsx | 16 +++++ .../transformations/HeadlineDetector.jsx | 34 ++++----- .../transformations/HeadlineToUppercase.jsx | 69 +++++++++++++++++++ .../models/transformations/ToTextPages.jsx | 25 ++++++- test/Headline.spec.js | 27 ++++++++ test/functions.spec.js | 39 +++++++++++ webpack.config.js | 3 +- 14 files changed, 238 insertions(+), 24 deletions(-) create mode 100644 src/javascript/models/markdown/Headline.jsx create mode 100644 src/javascript/models/markdown/MarkdownElement.jsx create mode 100644 src/javascript/models/transformations/HeadlineToUppercase.jsx create mode 100644 test/Headline.spec.js create mode 100644 test/functions.spec.js diff --git a/.eslintrc b/.eslintrc index c62b9e8..7a9fc2f 100644 --- a/.eslintrc +++ b/.eslintrc @@ -18,11 +18,13 @@ "env": { "browser": true, "node": true, - "es6": true + "es6": true, + "jasmine": true }, // Enable custom plugin known as eslint-plugin-react "plugins": [ - "react" + "react", + "jasmine" ], "rules": { // Disable `no-console` rule diff --git a/.gitignore b/.gitignore index 23b8d3d..025a419 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ node_modules/ build/ npm-debug.log +.eslintcache \ No newline at end of file diff --git a/package.json b/package.json index 259b9c0..6924178 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "watch": "webpack -d --watch", "build": "webpack", "lint": "eslint src --ext .js --ext .jsx --cache", + "test": "mocha --compilers js:babel-core/register test/*.spec.js", "release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p", "deploy": "npm run release && cp -r build/* docs/" }, @@ -40,14 +41,17 @@ "babel-preset-es2015": "^6.18.0", "babel-preset-react": "^6.16.0", "babel-preset-stage-0": "^6.16.0", + "chai": "^3.5.0", "copy-webpack-plugin": "^4.0.1", "css-loader": "^0.25.0", "esformatter-jsx": "^7.0.1", "eslint": "^3.7.0", + "eslint-plugin-jasmine": "^2.2.0", "eslint-plugin-react": "^6.3.0", "extract-text-webpack-plugin": "^1.0.1", "file-loader": "^0.9.0", "html-webpack-plugin": "^2.24.1", + "mocha": "^3.2.0", "style-loader": "^0.13.1", "url-loader": "^0.5.7", "webpack": "^1.13.3" diff --git a/src/javascript/functions.jsx b/src/javascript/functions.jsx index bb19aee..4e24e42 100644 --- a/src/javascript/functions.jsx +++ b/src/javascript/functions.jsx @@ -10,4 +10,20 @@ export function isNumber(string) { } } return true; -} \ No newline at end of file +} + +export function hasUpperCaseCharacterInMiddleOfWord(text) { + var beginningOfWord = true; + for (var i = 0; i < text.length; i++) { + const character = text.charAt(i); + if (character === ' ') { + beginningOfWord = true; + } else { + if (!beginningOfWord && isNaN(character * 1) && character == character.toUpperCase() && character.toUpperCase() != character.toLowerCase()) { + return true; + } + beginningOfWord = false; + } + } + return false; +} diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 6e65abe..e551823 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -6,6 +6,7 @@ import CombineSameY from './transformations/CombineSameY.jsx'; import DetectFootnotes from './transformations/DetectFootnotes.jsx' import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import HeadlineDetector from './transformations/HeadlineDetector.jsx' +import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' import ToTextPages from './transformations/ToTextPages.jsx'; import ToSingleTextPage from './transformations/ToSingleTextPage.jsx' @@ -24,6 +25,7 @@ export default class AppState { new DetectFootnotes(), new RemoveRepetitiveElements(), new HeadlineDetector(), + new HeadlineToUppercase(), new ToTextPages(), new ToSingleTextPage()]; diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx index c1765d4..1c5400b 100644 --- a/src/javascript/models/TextItem.jsx +++ b/src/javascript/models/TextItem.jsx @@ -8,6 +8,7 @@ export default class TextItem { this.height = options.height; this.text = options.text; this.annotation = options.annotation; + this.markdownElement = options.markdownElement; } } diff --git a/src/javascript/models/markdown/Headline.jsx b/src/javascript/models/markdown/Headline.jsx new file mode 100644 index 0000000..c633a23 --- /dev/null +++ b/src/javascript/models/markdown/Headline.jsx @@ -0,0 +1,17 @@ +import MarkdownElement from './MarkdownElement.jsx'; + +export default class Headline extends MarkdownElement { + + constructor(options) { + super({ + newLineBefore: true, + newLineAfter: true + }); + this.level = options.level; + } + + transformText(text) { + return '#'.repeat(this.level) + ' ' + text; + } + +} diff --git a/src/javascript/models/markdown/MarkdownElement.jsx b/src/javascript/models/markdown/MarkdownElement.jsx new file mode 100644 index 0000000..c860e7d --- /dev/null +++ b/src/javascript/models/markdown/MarkdownElement.jsx @@ -0,0 +1,16 @@ +// An text item detected as markdown element +export default class MarkdownElement { + + constructor(options) { + if (this.constructor === MarkdownElement) { + throw new TypeError("Can not construct abstract class."); + } + this.newLineBefore = options.newLineBefore; + this.newLineAfter = options.newLineAfter; + } + + transformText(text) { // eslint-disable-line no-unused-vars + throw new TypeError("Do not call abstract method foo from child."); + } + +} diff --git a/src/javascript/models/transformations/HeadlineDetector.jsx b/src/javascript/models/transformations/HeadlineDetector.jsx index bd05808..65591c7 100644 --- a/src/javascript/models/transformations/HeadlineDetector.jsx +++ b/src/javascript/models/transformations/HeadlineDetector.jsx @@ -4,6 +4,8 @@ import PdfPage from '../PdfPage.jsx'; import ContentView from '../ContentView.jsx'; import Annotation from '../Annotation.jsx'; +import Headline from '../markdown/Headline.jsx'; + function analyzeHeigths(pages) { const analyzationResult = { @@ -47,13 +49,13 @@ function analyzeHeigths(pages) { return analyzationResult; } -function findNextMajorHeight(heights, currentHeight, headlineMap) { +function findNextMajorHeight(heights, currentHeight, headlineLevels) { for (var i = currentHeight; i < heights.length; i++) { - if (headlineMap[heights[i]]) { + if (headlineLevels[heights[i]]) { return heights[i]; } } - throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineMap=${headlineMap}`; + throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineLevels=${headlineLevels}`; } @@ -95,26 +97,26 @@ export default class HeadlineDetector extends Transformation { }); - const headlineMap = {}; - headlineHeights.reverse().forEach((height, i) => headlineMap[height] = '#'.repeat(i + 1)); + const headlineLevels = {}; + headlineHeights.reverse().forEach((height, i) => headlineLevels[height] = i + 1); var lastMajorHeight = paragraphHeight; var heights = heightAnalyzation.heights; for (var i = 0; i < heights.length; i++) { - if (heights[i] > paragraphHeight && !headlineMap[heights[i]]) { - const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineMap); + if (heights[i] > paragraphHeight && !headlineLevels[heights[i]]) { + const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineLevels); const distanceToLower = heights[i] - lastMajorHeight; const distanceToHigher = nextMajorHeight - heights[i]; if (distanceToLower <= distanceToHigher) { if (lastMajorHeight == paragraphHeight) { paragraphHeight++; } else { - headlineMap[heights[i]] = headlineMap[lastMajorHeight]; + headlineLevels[heights[i]] = headlineLevels[lastMajorHeight]; } } else { - headlineMap[heights[i]] = headlineMap[nextMajorHeight]; + headlineLevels[heights[i]] = headlineLevels[nextMajorHeight]; } } - if (headlineMap[heights[i]]) { + if (headlineLevels[heights[i]]) { lastMajorHeight = heights[i]; } } @@ -125,12 +127,16 @@ export default class HeadlineDetector extends Transformation { if (item.height <= paragraphHeight) { newTextItems.push(item); } else { + const headlineLevel = headlineLevels[item.height]; newTextItems.push(new TextItem({ ...item, text: item.text, annotation: new Annotation({ - category: headlineMap[item.height], + category: "Headline " + headlineLevel, color: 'green' + }), + markdownElement: new Headline({ + level: headlineLevel }) })); } @@ -144,11 +150,7 @@ export default class HeadlineDetector extends Transformation { processAnnotations(pages:PdfPage[]) { pages.forEach(page => { - page.textItems.forEach(item => { - if (item.annotation) { - item.text = item.annotation.category + ' ' + item.text; - } - }); + page.textItems.forEach(textItem => textItem.annotation = null) }); return pages; } diff --git a/src/javascript/models/transformations/HeadlineToUppercase.jsx b/src/javascript/models/transformations/HeadlineToUppercase.jsx new file mode 100644 index 0000000..5d4f8b4 --- /dev/null +++ b/src/javascript/models/transformations/HeadlineToUppercase.jsx @@ -0,0 +1,69 @@ +import Transformation from './Transformation.jsx'; +import TextItem from '../TextItem.jsx'; +import PdfPage from '../PdfPage.jsx'; +import ContentView from '../ContentView.jsx'; +import Annotation from '../Annotation.jsx'; + +import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx' + +// Uppercase headlines are often parsed with very mixed character with pdf.js, like 'A heAdLine'. +// This tries to detect them and make them all uppercase. +export default class HeadlineToUppercase extends Transformation { + + constructor() { + super("Headlines Uppercase"); + } + + contentView() { + return ContentView.PDF; + } + + transform(pages:PdfPage[]) { + + + return pages.map(page => { + const newTextItems = []; + page.textItems.forEach(item => { + if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') { + const headline = item.text.trim(); + if (hasUpperCaseCharacterInMiddleOfWord(headline)) { + item.annotation = new Annotation({ + category: 'removed', + color: 'red' + }); + newTextItems.push(item); + newTextItems.push(new TextItem({ + ...item, + text: item.text.toUpperCase(), + annotation: new Annotation({ + category: "Uppercased", + color: 'green' + }) + })); + } else { + item.annotation = new Annotation({ + category: 'Untouched', + color: 'brown' + }); + newTextItems.push(item); + } + } else { + newTextItems.push(item); + } + }); + return { + ...page, + textItems: newTextItems + }; + }); + } + + processAnnotations(pages:PdfPage[]) { + pages.forEach(page => { + page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed'); + page.textItems.forEach(textItem => textItem.annotation = null) + }); + return pages; + } + +} \ No newline at end of file diff --git a/src/javascript/models/transformations/ToTextPages.jsx b/src/javascript/models/transformations/ToTextPages.jsx index a3baeac..7c35436 100644 --- a/src/javascript/models/transformations/ToTextPages.jsx +++ b/src/javascript/models/transformations/ToTextPages.jsx @@ -3,6 +3,13 @@ import PdfPage from '../PdfPage.jsx'; import TextPage from '../TextPage.jsx'; import ContentView from '../ContentView.jsx'; +function itemIsSameMarkdownElement(item1, item2) { + if (!item1.markdownElement || !item2.markdownElement) { + return false; + } + return item1.markdownElement.constructor.name === item2.markdownElement.constructor.name; +} + export default class ToTextPages extends Transformation { constructor() { @@ -14,11 +21,23 @@ export default class ToTextPages extends Transformation { } transform(pdfPages:PdfPage[]) { - return pdfPages.map(pdfPage => { + return pdfPages.map(page => { var text = ''; - pdfPage.textItems.forEach(textItem => text += textItem.text + '\n'); + page.textItems.forEach((textItem, i) => { + if (textItem.markdownElement) { + if (i > 0 && textItem.markdownElement.newLineBefore && !itemIsSameMarkdownElement(textItem, page.textItems[i - 1])) { + text += '\n' + } + text += textItem.markdownElement.transformText(textItem.text) + '\n' + if (textItem.markdownElement.newLineAfter && (i == page.textItems.length - 1 || !itemIsSameMarkdownElement(textItem, page.textItems[i + 1]))) { + text += '\n' + } + } else { + text += textItem.text + '\n' + } + }); return new TextPage({ - index: pdfPage.index, + index: page.index, text: text }); }); diff --git a/test/Headline.spec.js b/test/Headline.spec.js new file mode 100644 index 0000000..3195def --- /dev/null +++ b/test/Headline.spec.js @@ -0,0 +1,27 @@ +import { expect } from 'chai'; + +import Headline from '../src/javascript/models/markdown/Headline'; + +describe('Headline', () => { + + it('correct level 1 props', () => { + const headline = new Headline({ + level: 1 + }); + expect(headline.level).to.equal(1); + expect(headline.newLineBefore).to.equal(true); + expect(headline.newLineAfter).to.equal(true); + expect(headline.transformText('Hello World')).to.equal('# Hello World'); + }); + + it('correct level 2 props', () => { + const headline = new Headline({ + level: 2 + }); + expect(headline.level).to.equal(2); + expect(headline.newLineBefore).to.equal(true); + expect(headline.newLineAfter).to.equal(true); + expect(headline.transformText('Hello World')).to.equal('## Hello World'); + }); + +}); diff --git a/test/functions.spec.js b/test/functions.spec.js new file mode 100644 index 0000000..46a1433 --- /dev/null +++ b/test/functions.spec.js @@ -0,0 +1,39 @@ +import { expect } from 'chai'; + +import { hasUpperCaseCharacterInMiddleOfWord } from '../src/javascript/functions.jsx' + +describe('hasUpperCaseCharacterInMiddleOfWord', () => { + + it('single word', () => { + expect(hasUpperCaseCharacterInMiddleOfWord("word")).to.equal(false); + expect(hasUpperCaseCharacterInMiddleOfWord("Word")).to.equal(false); + + expect(hasUpperCaseCharacterInMiddleOfWord("wOrd")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("woRd")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("worD")).to.equal(true); + }); + + it('multi words', () => { + expect(hasUpperCaseCharacterInMiddleOfWord("Hello World")).to.equal(false); + expect(hasUpperCaseCharacterInMiddleOfWord("hello world")).to.equal(false); + + expect(hasUpperCaseCharacterInMiddleOfWord("HelloWorld")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("HellO World")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("Hello WOrld")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("Hello WorlD")).to.equal(true); + }); + + it('with numbers', () => { + expect(hasUpperCaseCharacterInMiddleOfWord("high5")).to.equal(false); + expect(hasUpperCaseCharacterInMiddleOfWord("High5")).to.equal(false); + expect(hasUpperCaseCharacterInMiddleOfWord("High 5")).to.equal(false); + expect(hasUpperCaseCharacterInMiddleOfWord("High 5th")).to.equal(false); + expect(hasUpperCaseCharacterInMiddleOfWord("High 5'sec")).to.equal(false); + expect(hasUpperCaseCharacterInMiddleOfWord("Type-0-mat")).to.equal(false); + + expect(hasUpperCaseCharacterInMiddleOfWord("HigH 5")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("High 5E")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("High 5 or tWo down")).to.equal(true); + expect(hasUpperCaseCharacterInMiddleOfWord("High 5'Sec")).to.equal(true); + }); +}); diff --git a/webpack.config.js b/webpack.config.js index 2f89db1..1dc8b32 100644 --- a/webpack.config.js +++ b/webpack.config.js @@ -15,10 +15,9 @@ module.exports = { filename: 'bundle.js' }, resolve: { - extensions: ['', '.js', '.vue'], + extensions: ['', '.js'], fallback: [path.join(__dirname, '../node_modules')], alias: { - 'vue$': 'vue/dist/vue', 'src': path.resolve(__dirname, '../src'), 'assets': path.resolve(__dirname, '../src/assets'), 'components': path.resolve(__dirname, '../src/components')