Cleanups & readme

* Move line-item transformations to own package
* Have WordFormat names instead of whole enum in globals
* Rename PdfUploadView to UploadView
* Correct license
This commit is contained in:
Johannes Zillmann 2017-03-30 07:40:21 +02:00
parent cf5d81a1bb
commit 46a965785a
18 changed files with 76 additions and 58 deletions

32
README.md Normal file
View File

@ -0,0 +1,32 @@
# PDF-To-Markdown Converter
JavaScript tool to parse PDF files and convert them into Markdown format. Online version at http://pdf2md.morethan.io!
## Major Changes
- **Apr 2017** - 0.1: Initial Release
## Contribute
Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)!
#### Useful Build Commands
- ```npm install``` Download all necessary npm packages
- ```npm run lint``` Lint the javascript files
- ```npm run test``` Run tests
- ```npm run check``` Lint & Test
- ```npm run watch``` Continuously build the project
- ```open build/index.html``` Open the built project in your default browser
- ```npm run release``` Build production version
- ```npm run deploy``` Build production version & move it to the GitHub Pages folder
#### Release
- Increase version in package.json
- ```npm run deploy```
- commit & push
## Credits
[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser

View File

@ -8,6 +8,7 @@
"build": "webpack", "build": "webpack",
"lint": "eslint src --ext .js --ext .jsx --cache", "lint": "eslint src --ext .js --ext .jsx --cache",
"test": "mocha --compilers js:babel-core/register test --recursive", "test": "mocha --compilers js:babel-core/register test --recursive",
"check": "npm run lint && npm run test",
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p", "release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
"deploy": "npm run release && cp -r build/* docs/" "deploy": "npm run release && cp -r build/* docs/"
}, },
@ -17,7 +18,7 @@
"Converter" "Converter"
], ],
"author": "Johannes Zillmann", "author": "Johannes Zillmann",
"license": "Apache-2.0", "license": "AGPL-3.0",
"repository": { "repository": {
"type": "git", "type": "git",
"url": "https://github.com/jzillmann/pdf-to-markdown" "url": "https://github.com/jzillmann/pdf-to-markdown"

View File

@ -5,7 +5,7 @@ import Grid from 'react-bootstrap/lib/Grid'
import TopBar from './TopBar.jsx'; import TopBar from './TopBar.jsx';
import FooterBar from './FooterBar.jsx' import FooterBar from './FooterBar.jsx'
import { View } from '../models/AppState.jsx'; import { View } from '../models/AppState.jsx';
import PdfUploadView from './PdfUploadView.jsx'; import UploadView from './UploadView.jsx';
import LoadingView from './LoadingView.jsx'; import LoadingView from './LoadingView.jsx';
import ResultView from './ResultView.jsx'; import ResultView from './ResultView.jsx';
import DebugView from './DebugView.jsx'; import DebugView from './DebugView.jsx';
@ -23,7 +23,7 @@ export default class App extends React.Component {
var mainView; var mainView;
switch (this.props.appState.mainView) { switch (this.props.appState.mainView) {
case View.UPLOAD: case View.UPLOAD:
mainView = <PdfUploadView uploadPdfFunction={ appState.storeFileBuffer } /> mainView = <UploadView uploadPdfFunction={ appState.storeFileBuffer } />
break; break;
case View.LOADING: case View.LOADING:
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } /> mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />

View File

@ -116,37 +116,36 @@ export default class DebugView extends React.Component {
return ( return (
<div> <div>
<AutoAffix viewportOffsetTop={ 15 } container={ this }> <AutoAffix viewportOffsetTop={ 0 } offsetTop={ 0 } container={ this }>
<table> <table>
<tbody> <tbody>
{ lastTransformation.showPageSelection() && <tr>
<tr> <td>
<td> <div>
<div> <ul className='pagination'>
<ul className='pagination'> <li className={ pageNr == -1 ? 'active' : '' }>
<li className={ pageNr == -1 ? 'active' : '' }> <a role='button' onClick={ this.selectPage.bind(this, 0) }>ALL</a>
<a role='button' onClick={ this.selectPage.bind(this, 0) }>ALL</a> </li>
</li> </ul>
</ul> <Pagination
<Pagination prev
prev next
next first
first last
last ellipsis
ellipsis boundaryLinks
boundaryLinks items={ pages.length }
items={ pages.length } maxButtons={ 17 }
maxButtons={ 17 } activePage={ this.state.pageNr + 1 }
activePage={ this.state.pageNr + 1 } onSelect={ this.selectPage.bind(this) } />
onSelect={ this.selectPage.bind(this) } /> </div>
</div> </td>
</td> <td style={ { padding: '5px', textAlign: 'left' } }>
<td style={ { padding: '5px', textAlign: 'left' } }> <Label bsStyle="info">
<Label bsStyle="info"> Pages
Pages </Label>
</Label> </td>
</td> </tr>
</tr> }
<tr> <tr>
<td> <td>
<ButtonToolbar> <ButtonToolbar>

View File

@ -5,7 +5,7 @@ import Alert from 'react-bootstrap/lib/Alert'
import Dropzone from 'react-dropzone' import Dropzone from 'react-dropzone'
import FaCloudUpload from 'react-icons/lib/fa/cloud-upload' import FaCloudUpload from 'react-icons/lib/fa/cloud-upload'
export default class PdfUploadView extends React.Component { export default class UploadView extends React.Component {
static propTypes = { static propTypes = {
uploadPdfFunction: React.PropTypes.func.isRequired, uploadPdfFunction: React.PropTypes.func.isRequired,
@ -42,7 +42,7 @@ export default class PdfUploadView extends React.Component {
<h1><FaCloudUpload width={ 100 } height={ 100 } /></h1> <h1><FaCloudUpload width={ 100 } height={ 100 } /></h1>
<br/> <br/>
<Alert bsStyle="warning"> <Alert bsStyle="warning">
<i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i> <i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. No matter how good the parser works for your PDF, you will have to invest a good amount of manuell work to complete it. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i>
</Alert> </Alert>
</Dropzone> </Dropzone>
<br/> <br/>

View File

@ -2,12 +2,12 @@ import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx'; import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
import CompactLines from './transformations/textitem/CompactLines.jsx'; import CompactLines from './transformations/lineitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/lineitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/lineitem/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectTOC from './transformations/lineitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectListItems from './transformations/lineitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' import DetectHeaders from './transformations/lineitem/DetectHeaders.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'

View File

@ -1,6 +1,7 @@
import TextItem from './TextItem.jsx'; import TextItem from './TextItem.jsx';
import Word from './Word.jsx'; import Word from './Word.jsx';
import WordType from './markdown/WordType.jsx'; import WordType from './markdown/WordType.jsx';
import WordFormat from './markdown/WordFormat.jsx';
import LineItem from './LineItem.jsx'; import LineItem from './LineItem.jsx';
import StashingStream from './StashingStream.jsx'; import StashingStream from './StashingStream.jsx';
import { ParsedElements } from './PageItem.jsx'; import { ParsedElements } from './PageItem.jsx';
@ -115,9 +116,10 @@ class WordDetectionStream extends StashingStream {
results.push(...this.itemsToWords(stash, format)); results.push(...this.itemsToWords(stash, format));
} }
itemsToWords(items, format) { itemsToWords(items, formatName) {
const combinedText = combineText(items); const combinedText = combineText(items);
const words = combinedText.split(' '); const words = combinedText.split(' ');
const format = formatName ? WordFormat.enumValueOf(formatName) : null;
return words.filter(w => w.trim().length > 0).map(word => { return words.filter(w => w.trim().length > 0).map(word => {
var type = null; var type = null;
if (word.startsWith('http:')) { if (word.startsWith('http:')) {

View File

@ -16,10 +16,6 @@ export default class ToLineItemBlockTransformation extends Transformation {
this.showWhitespaces = false; this.showWhitespaces = false;
} }
showPageSelection() {
return true;
}
showModificationCheckbox() { showModificationCheckbox() {
return true; return true;
} }

View File

@ -16,10 +16,6 @@ export default class ToLineItemTransformation extends Transformation {
this.showWhitespaces = false; this.showWhitespaces = false;
} }
showPageSelection() {
return true;
}
showModificationCheckbox() { showModificationCheckbox() {
return true; return true;
} }

View File

@ -16,10 +16,6 @@ export default class ToTextItemTransformation extends Transformation {
this.showWhitespaces = false; this.showWhitespaces = false;
} }
showPageSelection() {
return true;
}
showModificationCheckbox() { showModificationCheckbox() {
return true; return true;
} }

View File

@ -14,10 +14,6 @@ export default class Transformation {
this.itemType = itemType; this.itemType = itemType;
} }
showPageSelection() {
return true;
}
showModificationCheckbox() { showModificationCheckbox() {
return false; return false;
} }

View File

@ -67,7 +67,7 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
format = WordFormat.BOLD; format = WordFormat.BOLD;
} }
if (format) { if (format) {
fontToFormats.set(key, format); fontToFormats.set(key, format.name);
} }
}); });
fontIdToName.sort(); fontIdToName.sort();