From 46a965785a487746d145a32cdd28906cb1c55641 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Thu, 30 Mar 2017 07:40:21 +0200 Subject: [PATCH] Cleanups & readme * Move line-item transformations to own package * Have WordFormat names instead of whole enum in globals * Rename PdfUploadView to UploadView * Correct license --- README.md | 32 +++++++++++ package.json | 3 +- src/javascript/components/App.jsx | 4 +- src/javascript/components/DebugView.jsx | 57 +++++++++---------- .../{PdfUploadView.jsx => UploadView.jsx} | 4 +- src/javascript/models/AppState.jsx | 12 ++-- src/javascript/models/LineConverter.jsx | 4 +- .../ToLineItemBlockTransformation.jsx | 4 -- .../ToLineItemTransformation.jsx | 4 -- .../ToTextItemTransformation.jsx | 4 -- .../models/transformations/Transformation.jsx | 4 -- .../{textitem => lineitem}/CompactLines.jsx | 0 .../{textitem => lineitem}/DetectHeaders.jsx | 0 .../DetectListItems.jsx | 0 .../{textitem => lineitem}/DetectTOC.jsx | 0 .../RemoveRepetitiveElements.jsx | 0 .../VerticalToHorizontal.jsx | 0 .../textitem/CalculateGlobalStats.jsx | 2 +- 18 files changed, 76 insertions(+), 58 deletions(-) create mode 100644 README.md rename src/javascript/components/{PdfUploadView.jsx => UploadView.jsx} (86%) rename src/javascript/models/transformations/{textitem => lineitem}/CompactLines.jsx (100%) rename src/javascript/models/transformations/{textitem => lineitem}/DetectHeaders.jsx (100%) rename src/javascript/models/transformations/{textitem => lineitem}/DetectListItems.jsx (100%) rename src/javascript/models/transformations/{textitem => lineitem}/DetectTOC.jsx (100%) rename src/javascript/models/transformations/{textitem => lineitem}/RemoveRepetitiveElements.jsx (100%) rename src/javascript/models/transformations/{textitem => lineitem}/VerticalToHorizontal.jsx (100%) diff --git a/README.md b/README.md new file mode 100644 index 0000000..3b17d12 --- /dev/null +++ b/README.md @@ -0,0 +1,32 @@ +# PDF-To-Markdown Converter + +Javascript tool to 
parse PDF files and convert them into Markdown format. Online version at http://pdf2md.morethan.io! + +## Major Changes + +- **Apr 2017** - 0.1: Initial Release + +## Contribute + +Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)! + +#### Useful Build Commands + +- ```npm install``` Download all necessary npm packages +- ```npm run lint``` Lint the javascript files +- ```npm run test``` Run tests +- ```npm run check``` Lint & Test +- ```npm run watch``` Continuously build the project +- ```open build/index.html``` Open the built project in your default browser +- ```npm run release``` Build production version +- ```npm run deploy``` Build production version & move it to the github pages folder + +#### Release +- Increase version in package.json +- ```npm run deploy``` +- commit & push + + +## Credits + +[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser \ No newline at end of file diff --git a/package.json b/package.json index 99271ee..c55cb60 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "build": "webpack", "lint": "eslint src --ext .js --ext .jsx --cache", "test": "mocha --compilers js:babel-core/register test --recursive", + "check": "npm run lint && npm run test", "release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p", "deploy": "npm run release && cp -r build/* docs/" }, @@ -17,7 +18,7 @@ "Converter" ], "author": "Johannes Zillmann", - "license": "Apache-2.0", + "license": "AGPL-3.0", "repository": { "type": "git", "url": "https://github.com/jzillmann/pdf-to-markdown" diff --git a/src/javascript/components/App.jsx b/src/javascript/components/App.jsx index 304a86f..b974f22 100644 --- a/src/javascript/components/App.jsx +++ b/src/javascript/components/App.jsx @@ -5,7 +5,7 @@ import Grid from 'react-bootstrap/lib/Grid' import TopBar from './TopBar.jsx'; 
import FooterBar from './FooterBar.jsx' import { View } from '../models/AppState.jsx'; -import PdfUploadView from './PdfUploadView.jsx'; +import UploadView from './UploadView.jsx'; import LoadingView from './LoadingView.jsx'; import ResultView from './ResultView.jsx'; import DebugView from './DebugView.jsx'; @@ -23,7 +23,7 @@ export default class App extends React.Component { var mainView; switch (this.props.appState.mainView) { case View.UPLOAD: - mainView = + mainView = break; case View.LOADING: mainView = diff --git a/src/javascript/components/DebugView.jsx b/src/javascript/components/DebugView.jsx index 31399ab..ba34780 100644 --- a/src/javascript/components/DebugView.jsx +++ b/src/javascript/components/DebugView.jsx @@ -116,37 +116,36 @@ export default class DebugView extends React.Component { return (
- + - { lastTransformation.showPageSelection() && - - - - } + + + +
-
- - -
-
- -
+
+ + +
+
+ +
diff --git a/src/javascript/components/PdfUploadView.jsx b/src/javascript/components/UploadView.jsx similarity index 86% rename from src/javascript/components/PdfUploadView.jsx rename to src/javascript/components/UploadView.jsx index 9102083..ead10f1 100644 --- a/src/javascript/components/PdfUploadView.jsx +++ b/src/javascript/components/UploadView.jsx @@ -5,7 +5,7 @@ import Alert from 'react-bootstrap/lib/Alert' import Dropzone from 'react-dropzone' import FaCloudUpload from 'react-icons/lib/fa/cloud-upload' -export default class PdfUploadView extends React.Component { +export default class UploadView extends React.Component { static propTypes = { uploadPdfFunction: React.PropTypes.func.isRequired, @@ -42,7 +42,7 @@ export default class PdfUploadView extends React.Component {


- This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only. + This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. No matter how well the parser works for your PDF, you will have to invest a good amount of manual work to complete it. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 178e35c..98ac607 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -2,12 +2,12 @@ import { Enum } from 'enumify'; import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx'; -import CompactLines from './transformations/textitem/CompactLines.jsx'; -import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx' -import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; -import DetectTOC from './transformations/textitem/DetectTOC.jsx' -import DetectListItems from './transformations/textitem/DetectListItems.jsx' -import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' +import CompactLines from './transformations/lineitem/CompactLines.jsx'; +import RemoveRepetitiveElements from './transformations/lineitem/RemoveRepetitiveElements.jsx' +import VerticalToHorizontal from './transformations/lineitem/VerticalToHorizontal.jsx'; +import DetectTOC from './transformations/lineitem/DetectTOC.jsx' +import DetectListItems from './transformations/lineitem/DetectListItems.jsx' +import DetectHeaders from './transformations/lineitem/DetectHeaders.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' diff --git a/src/javascript/models/LineConverter.jsx b/src/javascript/models/LineConverter.jsx index bb7160a..cec894c 100644 --- a/src/javascript/models/LineConverter.jsx +++ b/src/javascript/models/LineConverter.jsx @@ -1,6 +1,7 @@ import TextItem from './TextItem.jsx'; import Word from './Word.jsx'; import WordType from './markdown/WordType.jsx'; +import WordFormat from './markdown/WordFormat.jsx'; import LineItem from './LineItem.jsx'; import StashingStream from './StashingStream.jsx'; import { ParsedElements } from './PageItem.jsx'; 
@@ -115,9 +116,10 @@ class WordDetectionStream extends StashingStream { results.push(...this.itemsToWords(stash, format)); } - itemsToWords(items, format) { + itemsToWords(items, formatName) { const combinedText = combineText(items); const words = combinedText.split(' '); + const format = formatName ? WordFormat.enumValueOf(formatName) : null; return words.filter(w => w.trim().length > 0).map(word => { var type = null; if (word.startsWith('http:')) { diff --git a/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx b/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx index 92e2524..7a90abf 100644 --- a/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx +++ b/src/javascript/models/transformations/ToLineItemBlockTransformation.jsx @@ -16,10 +16,6 @@ export default class ToLineItemBlockTransformation extends Transformation { this.showWhitespaces = false; } - showPageSelection() { - return true; - } - showModificationCheckbox() { return true; } diff --git a/src/javascript/models/transformations/ToLineItemTransformation.jsx b/src/javascript/models/transformations/ToLineItemTransformation.jsx index 10090fd..e248025 100644 --- a/src/javascript/models/transformations/ToLineItemTransformation.jsx +++ b/src/javascript/models/transformations/ToLineItemTransformation.jsx @@ -16,10 +16,6 @@ export default class ToLineItemTransformation extends Transformation { this.showWhitespaces = false; } - showPageSelection() { - return true; - } - showModificationCheckbox() { return true; } diff --git a/src/javascript/models/transformations/ToTextItemTransformation.jsx b/src/javascript/models/transformations/ToTextItemTransformation.jsx index 9ae5c1b..bdf5998 100644 --- a/src/javascript/models/transformations/ToTextItemTransformation.jsx +++ b/src/javascript/models/transformations/ToTextItemTransformation.jsx @@ -16,10 +16,6 @@ export default class ToTextItemTransformation extends Transformation { this.showWhitespaces = false; } 
- showPageSelection() { - return true; - } - showModificationCheckbox() { return true; } diff --git a/src/javascript/models/transformations/Transformation.jsx b/src/javascript/models/transformations/Transformation.jsx index ae31d78..ee75bfc 100644 --- a/src/javascript/models/transformations/Transformation.jsx +++ b/src/javascript/models/transformations/Transformation.jsx @@ -14,10 +14,6 @@ export default class Transformation { this.itemType = itemType; } - showPageSelection() { - return true; - } - showModificationCheckbox() { return false; } diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/lineitem/CompactLines.jsx similarity index 100% rename from src/javascript/models/transformations/textitem/CompactLines.jsx rename to src/javascript/models/transformations/lineitem/CompactLines.jsx diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/lineitem/DetectHeaders.jsx similarity index 100% rename from src/javascript/models/transformations/textitem/DetectHeaders.jsx rename to src/javascript/models/transformations/lineitem/DetectHeaders.jsx diff --git a/src/javascript/models/transformations/textitem/DetectListItems.jsx b/src/javascript/models/transformations/lineitem/DetectListItems.jsx similarity index 100% rename from src/javascript/models/transformations/textitem/DetectListItems.jsx rename to src/javascript/models/transformations/lineitem/DetectListItems.jsx diff --git a/src/javascript/models/transformations/textitem/DetectTOC.jsx b/src/javascript/models/transformations/lineitem/DetectTOC.jsx similarity index 100% rename from src/javascript/models/transformations/textitem/DetectTOC.jsx rename to src/javascript/models/transformations/lineitem/DetectTOC.jsx diff --git a/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/lineitem/RemoveRepetitiveElements.jsx similarity index 
100% rename from src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx rename to src/javascript/models/transformations/lineitem/RemoveRepetitiveElements.jsx diff --git a/src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx b/src/javascript/models/transformations/lineitem/VerticalToHorizontal.jsx similarity index 100% rename from src/javascript/models/transformations/textitem/VerticalToHorizontal.jsx rename to src/javascript/models/transformations/lineitem/VerticalToHorizontal.jsx diff --git a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx index 1df39a3..83995be 100644 --- a/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx +++ b/src/javascript/models/transformations/textitem/CalculateGlobalStats.jsx @@ -67,7 +67,7 @@ export default class CalculateGlobalStats extends ToTextItemTransformation { format = WordFormat.BOLD; } if (format) { - fontToFormats.set(key, format); + fontToFormats.set(key, format.name); } }); fontIdToName.sort();