Cleanups & readme

* Move line-item transformations to own package
* Have WordFormat names instead of whole enum in globals
* Rename PdfUploadView to UploadView
* Correct license
This commit is contained in:
Johannes Zillmann 2017-03-30 07:40:21 +02:00
parent cf5d81a1bb
commit 46a965785a
18 changed files with 76 additions and 58 deletions

32
README.md Normal file
View File

@ -0,0 +1,32 @@
# PDF-To-Markdown Converter
JavaScript tool to parse PDF files and convert them into Markdown format. Online version at http://pdf2md.morethan.io!
## Major Changes
- **Apr 2017** - 0.1: Initial Release
## Contribute
Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)!
#### Useful Build Commands
- ```npm install``` Download all necessary npm packages
- ```npm run lint``` Lint the javascript files
- ```npm run test``` Run tests
- ```npm run check``` Lint & Test
- ```npm run watch``` Continuously build the project
- ```open build/index.html``` Open the built project in your default browser
- ```npm run release``` Build production version
- ```npm run deploy``` Build production version & move it to the GitHub Pages folder
#### Release
- Increase version in package.json
- ```npm run deploy```
- commit & push
## Credits
[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser

View File

@ -8,6 +8,7 @@
"build": "webpack",
"lint": "eslint src --ext .js --ext .jsx --cache",
"test": "mocha --compilers js:babel-core/register test --recursive",
"check": "npm run lint && npm run test",
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
"deploy": "npm run release && cp -r build/* docs/"
},
@ -17,7 +18,7 @@
"Converter"
],
"author": "Johannes Zillmann",
"license": "Apache-2.0",
"license": "AGPL-3.0",
"repository": {
"type": "git",
"url": "https://github.com/jzillmann/pdf-to-markdown"

View File

@ -5,7 +5,7 @@ import Grid from 'react-bootstrap/lib/Grid'
import TopBar from './TopBar.jsx';
import FooterBar from './FooterBar.jsx'
import { View } from '../models/AppState.jsx';
import PdfUploadView from './PdfUploadView.jsx';
import UploadView from './UploadView.jsx';
import LoadingView from './LoadingView.jsx';
import ResultView from './ResultView.jsx';
import DebugView from './DebugView.jsx';
@ -23,7 +23,7 @@ export default class App extends React.Component {
var mainView;
switch (this.props.appState.mainView) {
case View.UPLOAD:
mainView = <PdfUploadView uploadPdfFunction={ appState.storeFileBuffer } />
mainView = <UploadView uploadPdfFunction={ appState.storeFileBuffer } />
break;
case View.LOADING:
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />

View File

@ -116,37 +116,36 @@ export default class DebugView extends React.Component {
return (
<div>
<AutoAffix viewportOffsetTop={ 15 } container={ this }>
<AutoAffix viewportOffsetTop={ 0 } offsetTop={ 0 } container={ this }>
<table>
<tbody>
{ lastTransformation.showPageSelection() &&
<tr>
<td>
<div>
<ul className='pagination'>
<li className={ pageNr == -1 ? 'active' : '' }>
<a role='button' onClick={ this.selectPage.bind(this, 0) }>ALL</a>
</li>
</ul>
<Pagination
prev
next
first
last
ellipsis
boundaryLinks
items={ pages.length }
maxButtons={ 17 }
activePage={ this.state.pageNr + 1 }
onSelect={ this.selectPage.bind(this) } />
</div>
</td>
<td style={ { padding: '5px', textAlign: 'left' } }>
<Label bsStyle="info">
Pages
</Label>
</td>
</tr> }
<tr>
<td>
<div>
<ul className='pagination'>
<li className={ pageNr == -1 ? 'active' : '' }>
<a role='button' onClick={ this.selectPage.bind(this, 0) }>ALL</a>
</li>
</ul>
<Pagination
prev
next
first
last
ellipsis
boundaryLinks
items={ pages.length }
maxButtons={ 17 }
activePage={ this.state.pageNr + 1 }
onSelect={ this.selectPage.bind(this) } />
</div>
</td>
<td style={ { padding: '5px', textAlign: 'left' } }>
<Label bsStyle="info">
Pages
</Label>
</td>
</tr>
<tr>
<td>
<ButtonToolbar>

View File

@ -5,7 +5,7 @@ import Alert from 'react-bootstrap/lib/Alert'
import Dropzone from 'react-dropzone'
import FaCloudUpload from 'react-icons/lib/fa/cloud-upload'
export default class PdfUploadView extends React.Component {
export default class UploadView extends React.Component {
static propTypes = {
uploadPdfFunction: React.PropTypes.func.isRequired,
@ -42,7 +42,7 @@ export default class PdfUploadView extends React.Component {
<h1><FaCloudUpload width={ 100 } height={ 100 } /></h1>
<br/>
<Alert bsStyle="warning">
<i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i>
<i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. No matter how good the parser works for your PDF, you will have to invest a good amount of manuell work to complete it. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i>
</Alert>
</Dropzone>
<br/>

View File

@ -2,12 +2,12 @@ import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
import CompactLines from './transformations/textitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import CompactLines from './transformations/lineitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/lineitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/lineitem/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/lineitem/DetectTOC.jsx'
import DetectListItems from './transformations/lineitem/DetectListItems.jsx'
import DetectHeaders from './transformations/lineitem/DetectHeaders.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'

View File

@ -1,6 +1,7 @@
import TextItem from './TextItem.jsx';
import Word from './Word.jsx';
import WordType from './markdown/WordType.jsx';
import WordFormat from './markdown/WordFormat.jsx';
import LineItem from './LineItem.jsx';
import StashingStream from './StashingStream.jsx';
import { ParsedElements } from './PageItem.jsx';
@ -115,9 +116,10 @@ class WordDetectionStream extends StashingStream {
results.push(...this.itemsToWords(stash, format));
}
itemsToWords(items, format) {
itemsToWords(items, formatName) {
const combinedText = combineText(items);
const words = combinedText.split(' ');
const format = formatName ? WordFormat.enumValueOf(formatName) : null;
return words.filter(w => w.trim().length > 0).map(word => {
var type = null;
if (word.startsWith('http:')) {

View File

@ -16,10 +16,6 @@ export default class ToLineItemBlockTransformation extends Transformation {
this.showWhitespaces = false;
}
showPageSelection() {
return true;
}
showModificationCheckbox() {
return true;
}

View File

@ -16,10 +16,6 @@ export default class ToLineItemTransformation extends Transformation {
this.showWhitespaces = false;
}
showPageSelection() {
return true;
}
showModificationCheckbox() {
return true;
}

View File

@ -16,10 +16,6 @@ export default class ToTextItemTransformation extends Transformation {
this.showWhitespaces = false;
}
showPageSelection() {
return true;
}
showModificationCheckbox() {
return true;
}

View File

@ -14,10 +14,6 @@ export default class Transformation {
this.itemType = itemType;
}
showPageSelection() {
return true;
}
showModificationCheckbox() {
return false;
}

View File

@ -67,7 +67,7 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
format = WordFormat.BOLD;
}
if (format) {
fontToFormats.set(key, format);
fontToFormats.set(key, format.name);
}
});
fontIdToName.sort();