mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-19 12:14:26 +01:00
Cleanups & readme
* Move line-item transformations to own package * Have WordFormat names instead of whole enum in globals * Rename PdfUploadView to UploadView * Correct license
This commit is contained in:
parent
cf5d81a1bb
commit
46a965785a
32
README.md
Normal file
32
README.md
Normal file
@ -0,0 +1,32 @@
|
||||
# PDF-To-Markdown Converter
|
||||
|
||||
JavaScript tool to parse PDF files and convert them into Markdown format. Online version at http://pdf2md.morethan.io!
|
||||
|
||||
## Major Changes
|
||||
|
||||
- **Apr 2017** - 0.1: Initial Release
|
||||
|
||||
## Contribute
|
||||
|
||||
Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)!
|
||||
|
||||
#### Useful Build Commands
|
||||
|
||||
- ```npm install``` Download all necessary npm packages
|
||||
- ```npm run lint``` Lint the javascript files
|
||||
- ```npm run test``` Run tests
|
||||
- ```npm run check``` Lint & Test
|
||||
- ```npm run watch``` Continuously build the project
|
||||
- ```open build/index.html``` Open the built project in your default browser
|
||||
- ```npm run release``` Build production version
|
||||
- ```npm run deploy``` Build production version & move it to the GitHub Pages folder
|
||||
|
||||
#### Release
|
||||
- Increase version in package.json
|
||||
- ```npm run deploy```
|
||||
- commit & push
|
||||
|
||||
|
||||
## Credits
|
||||
|
||||
[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser
|
@ -8,6 +8,7 @@
|
||||
"build": "webpack",
|
||||
"lint": "eslint src --ext .js --ext .jsx --cache",
|
||||
"test": "mocha --compilers js:babel-core/register test --recursive",
|
||||
"check": "npm run lint && npm run test",
|
||||
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
|
||||
"deploy": "npm run release && cp -r build/* docs/"
|
||||
},
|
||||
@ -17,7 +18,7 @@
|
||||
"Converter"
|
||||
],
|
||||
"author": "Johannes Zillmann",
|
||||
"license": "Apache-2.0",
|
||||
"license": "AGPL-3.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/jzillmann/pdf-to-markdown"
|
||||
|
@ -5,7 +5,7 @@ import Grid from 'react-bootstrap/lib/Grid'
|
||||
import TopBar from './TopBar.jsx';
|
||||
import FooterBar from './FooterBar.jsx'
|
||||
import { View } from '../models/AppState.jsx';
|
||||
import PdfUploadView from './PdfUploadView.jsx';
|
||||
import UploadView from './UploadView.jsx';
|
||||
import LoadingView from './LoadingView.jsx';
|
||||
import ResultView from './ResultView.jsx';
|
||||
import DebugView from './DebugView.jsx';
|
||||
@ -23,7 +23,7 @@ export default class App extends React.Component {
|
||||
var mainView;
|
||||
switch (this.props.appState.mainView) {
|
||||
case View.UPLOAD:
|
||||
mainView = <PdfUploadView uploadPdfFunction={ appState.storeFileBuffer } />
|
||||
mainView = <UploadView uploadPdfFunction={ appState.storeFileBuffer } />
|
||||
break;
|
||||
case View.LOADING:
|
||||
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
|
||||
|
@ -116,37 +116,36 @@ export default class DebugView extends React.Component {
|
||||
|
||||
return (
|
||||
<div>
|
||||
<AutoAffix viewportOffsetTop={ 15 } container={ this }>
|
||||
<AutoAffix viewportOffsetTop={ 0 } offsetTop={ 0 } container={ this }>
|
||||
<table>
|
||||
<tbody>
|
||||
{ lastTransformation.showPageSelection() &&
|
||||
<tr>
|
||||
<td>
|
||||
<div>
|
||||
<ul className='pagination'>
|
||||
<li className={ pageNr == -1 ? 'active' : '' }>
|
||||
<a role='button' onClick={ this.selectPage.bind(this, 0) }>ALL</a>
|
||||
</li>
|
||||
</ul>
|
||||
<Pagination
|
||||
prev
|
||||
next
|
||||
first
|
||||
last
|
||||
ellipsis
|
||||
boundaryLinks
|
||||
items={ pages.length }
|
||||
maxButtons={ 17 }
|
||||
activePage={ this.state.pageNr + 1 }
|
||||
onSelect={ this.selectPage.bind(this) } />
|
||||
</div>
|
||||
</td>
|
||||
<td style={ { padding: '5px', textAlign: 'left' } }>
|
||||
<Label bsStyle="info">
|
||||
Pages
|
||||
</Label>
|
||||
</td>
|
||||
</tr> }
|
||||
<tr>
|
||||
<td>
|
||||
<div>
|
||||
<ul className='pagination'>
|
||||
<li className={ pageNr == -1 ? 'active' : '' }>
|
||||
<a role='button' onClick={ this.selectPage.bind(this, 0) }>ALL</a>
|
||||
</li>
|
||||
</ul>
|
||||
<Pagination
|
||||
prev
|
||||
next
|
||||
first
|
||||
last
|
||||
ellipsis
|
||||
boundaryLinks
|
||||
items={ pages.length }
|
||||
maxButtons={ 17 }
|
||||
activePage={ this.state.pageNr + 1 }
|
||||
onSelect={ this.selectPage.bind(this) } />
|
||||
</div>
|
||||
</td>
|
||||
<td style={ { padding: '5px', textAlign: 'left' } }>
|
||||
<Label bsStyle="info">
|
||||
Pages
|
||||
</Label>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<ButtonToolbar>
|
||||
|
@ -5,7 +5,7 @@ import Alert from 'react-bootstrap/lib/Alert'
|
||||
import Dropzone from 'react-dropzone'
|
||||
import FaCloudUpload from 'react-icons/lib/fa/cloud-upload'
|
||||
|
||||
export default class PdfUploadView extends React.Component {
|
||||
export default class UploadView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
uploadPdfFunction: React.PropTypes.func.isRequired,
|
||||
@ -42,7 +42,7 @@ export default class PdfUploadView extends React.Component {
|
||||
<h1><FaCloudUpload width={ 100 } height={ 100 } /></h1>
|
||||
<br/>
|
||||
<Alert bsStyle="warning">
|
||||
<i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i>
|
||||
<i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. No matter how good the parser works for your PDF, you will have to invest a good amount of manuell work to complete it. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i>
|
||||
</Alert>
|
||||
</Dropzone>
|
||||
<br/>
|
@ -2,12 +2,12 @@ import { Enum } from 'enumify';
|
||||
|
||||
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
|
||||
|
||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||
import CompactLines from './transformations/lineitem/CompactLines.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/lineitem/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/lineitem/VerticalToHorizontal.jsx';
|
||||
import DetectTOC from './transformations/lineitem/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/lineitem/DetectListItems.jsx'
|
||||
import DetectHeaders from './transformations/lineitem/DetectHeaders.jsx'
|
||||
|
||||
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
||||
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
||||
|
@ -1,6 +1,7 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import Word from './Word.jsx';
|
||||
import WordType from './markdown/WordType.jsx';
|
||||
import WordFormat from './markdown/WordFormat.jsx';
|
||||
import LineItem from './LineItem.jsx';
|
||||
import StashingStream from './StashingStream.jsx';
|
||||
import { ParsedElements } from './PageItem.jsx';
|
||||
@ -115,9 +116,10 @@ class WordDetectionStream extends StashingStream {
|
||||
results.push(...this.itemsToWords(stash, format));
|
||||
}
|
||||
|
||||
itemsToWords(items, format) {
|
||||
itemsToWords(items, formatName) {
|
||||
const combinedText = combineText(items);
|
||||
const words = combinedText.split(' ');
|
||||
const format = formatName ? WordFormat.enumValueOf(formatName) : null;
|
||||
return words.filter(w => w.trim().length > 0).map(word => {
|
||||
var type = null;
|
||||
if (word.startsWith('http:')) {
|
||||
|
@ -16,10 +16,6 @@ export default class ToLineItemBlockTransformation extends Transformation {
|
||||
this.showWhitespaces = false;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
@ -16,10 +16,6 @@ export default class ToLineItemTransformation extends Transformation {
|
||||
this.showWhitespaces = false;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
@ -16,10 +16,6 @@ export default class ToTextItemTransformation extends Transformation {
|
||||
this.showWhitespaces = false;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
@ -14,10 +14,6 @@ export default class Transformation {
|
||||
this.itemType = itemType;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return false;
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||
format = WordFormat.BOLD;
|
||||
}
|
||||
if (format) {
|
||||
fontToFormats.set(key, format);
|
||||
fontToFormats.set(key, format.name);
|
||||
}
|
||||
});
|
||||
fontIdToName.sort();
|
||||
|
Loading…
Reference in New Issue
Block a user