mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-25 01:03:59 +01:00
Cleanups & readme
* Move line-item transformations to own package * Have WordFormat names instead of whole enum in globals * Rename PdfUploadView to UploadView * Correct license
This commit is contained in:
parent
cf5d81a1bb
commit
46a965785a
32
README.md
Normal file
32
README.md
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
# PDF-To-Markdown Converter
|
||||||
|
|
||||||
|
JavaScript tool to parse PDF files and convert them into Markdown format. Online version at http://pdf2md.morethan.io!
|
||||||
|
|
||||||
|
## Major Changes
|
||||||
|
|
||||||
|
- **Apr 2017** - 0.1: Initial Release
|
||||||
|
|
||||||
|
## Contribute
|
||||||
|
|
||||||
|
Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)!
|
||||||
|
|
||||||
|
#### Useful Build Commands
|
||||||
|
|
||||||
|
- ```npm install``` Download all necessary npm packages
|
||||||
|
- ```npm run lint``` Lint the javascript files
|
||||||
|
- ```npm run test``` Run tests
|
||||||
|
- ```npm run check``` Lint & Test
|
||||||
|
- ```npm run watch``` Continuously build the project
|
||||||
|
- ```open build/index.html``` Open the built project in your default browser
|
||||||
|
- ```npm run release``` Build production version
|
||||||
|
- ```npm run deploy``` Build production version & move it to the GitHub Pages folder
|
||||||
|
|
||||||
|
#### Release
|
||||||
|
- Increase version in package.json
|
||||||
|
- ```npm run deploy```
|
||||||
|
- commit & push
|
||||||
|
|
||||||
|
|
||||||
|
## Credits
|
||||||
|
|
||||||
|
[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser
|
@ -8,6 +8,7 @@
|
|||||||
"build": "webpack",
|
"build": "webpack",
|
||||||
"lint": "eslint src --ext .js --ext .jsx --cache",
|
"lint": "eslint src --ext .js --ext .jsx --cache",
|
||||||
"test": "mocha --compilers js:babel-core/register test --recursive",
|
"test": "mocha --compilers js:babel-core/register test --recursive",
|
||||||
|
"check": "npm run lint && npm run test",
|
||||||
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
|
"release": "npm run lint && rm -rf build/* && NODE_ENV=production webpack -p",
|
||||||
"deploy": "npm run release && cp -r build/* docs/"
|
"deploy": "npm run release && cp -r build/* docs/"
|
||||||
},
|
},
|
||||||
@ -17,7 +18,7 @@
|
|||||||
"Converter"
|
"Converter"
|
||||||
],
|
],
|
||||||
"author": "Johannes Zillmann",
|
"author": "Johannes Zillmann",
|
||||||
"license": "Apache-2.0",
|
"license": "AGPL-3.0",
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/jzillmann/pdf-to-markdown"
|
"url": "https://github.com/jzillmann/pdf-to-markdown"
|
||||||
|
@ -5,7 +5,7 @@ import Grid from 'react-bootstrap/lib/Grid'
|
|||||||
import TopBar from './TopBar.jsx';
|
import TopBar from './TopBar.jsx';
|
||||||
import FooterBar from './FooterBar.jsx'
|
import FooterBar from './FooterBar.jsx'
|
||||||
import { View } from '../models/AppState.jsx';
|
import { View } from '../models/AppState.jsx';
|
||||||
import PdfUploadView from './PdfUploadView.jsx';
|
import UploadView from './UploadView.jsx';
|
||||||
import LoadingView from './LoadingView.jsx';
|
import LoadingView from './LoadingView.jsx';
|
||||||
import ResultView from './ResultView.jsx';
|
import ResultView from './ResultView.jsx';
|
||||||
import DebugView from './DebugView.jsx';
|
import DebugView from './DebugView.jsx';
|
||||||
@ -23,7 +23,7 @@ export default class App extends React.Component {
|
|||||||
var mainView;
|
var mainView;
|
||||||
switch (this.props.appState.mainView) {
|
switch (this.props.appState.mainView) {
|
||||||
case View.UPLOAD:
|
case View.UPLOAD:
|
||||||
mainView = <PdfUploadView uploadPdfFunction={ appState.storeFileBuffer } />
|
mainView = <UploadView uploadPdfFunction={ appState.storeFileBuffer } />
|
||||||
break;
|
break;
|
||||||
case View.LOADING:
|
case View.LOADING:
|
||||||
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
|
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
|
||||||
|
@ -116,10 +116,9 @@ export default class DebugView extends React.Component {
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<div>
|
<div>
|
||||||
<AutoAffix viewportOffsetTop={ 15 } container={ this }>
|
<AutoAffix viewportOffsetTop={ 0 } offsetTop={ 0 } container={ this }>
|
||||||
<table>
|
<table>
|
||||||
<tbody>
|
<tbody>
|
||||||
{ lastTransformation.showPageSelection() &&
|
|
||||||
<tr>
|
<tr>
|
||||||
<td>
|
<td>
|
||||||
<div>
|
<div>
|
||||||
@ -146,7 +145,7 @@ export default class DebugView extends React.Component {
|
|||||||
Pages
|
Pages
|
||||||
</Label>
|
</Label>
|
||||||
</td>
|
</td>
|
||||||
</tr> }
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>
|
<td>
|
||||||
<ButtonToolbar>
|
<ButtonToolbar>
|
||||||
|
@ -5,7 +5,7 @@ import Alert from 'react-bootstrap/lib/Alert'
|
|||||||
import Dropzone from 'react-dropzone'
|
import Dropzone from 'react-dropzone'
|
||||||
import FaCloudUpload from 'react-icons/lib/fa/cloud-upload'
|
import FaCloudUpload from 'react-icons/lib/fa/cloud-upload'
|
||||||
|
|
||||||
export default class PdfUploadView extends React.Component {
|
export default class UploadView extends React.Component {
|
||||||
|
|
||||||
static propTypes = {
|
static propTypes = {
|
||||||
uploadPdfFunction: React.PropTypes.func.isRequired,
|
uploadPdfFunction: React.PropTypes.func.isRequired,
|
||||||
@ -42,7 +42,7 @@ export default class PdfUploadView extends React.Component {
|
|||||||
<h1><FaCloudUpload width={ 100 } height={ 100 } /></h1>
|
<h1><FaCloudUpload width={ 100 } height={ 100 } /></h1>
|
||||||
<br/>
|
<br/>
|
||||||
<Alert bsStyle="warning">
|
<Alert bsStyle="warning">
|
||||||
<i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i>
|
<i>This tool converts a PDF file into a Markdown text format! Simply drag & drop your PDF file on the upload area and go from there. Don't expect wonders, there are a lot of variances in generated PDF's from different tools and different ages. No matter how good the parser works for your PDF, you will have to invest a good amount of manuell work to complete it. Though this tool aims to be general purpose, it has been tested on a certain set of PDF's only.</i>
|
||||||
</Alert>
|
</Alert>
|
||||||
</Dropzone>
|
</Dropzone>
|
||||||
<br/>
|
<br/>
|
@ -2,12 +2,12 @@ import { Enum } from 'enumify';
|
|||||||
|
|
||||||
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
|
import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStats.jsx';
|
||||||
|
|
||||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
import CompactLines from './transformations/lineitem/CompactLines.jsx';
|
||||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/lineitem/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/lineitem/VerticalToHorizontal.jsx';
|
||||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
import DetectTOC from './transformations/lineitem/DetectTOC.jsx'
|
||||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
import DetectListItems from './transformations/lineitem/DetectListItems.jsx'
|
||||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
import DetectHeaders from './transformations/lineitem/DetectHeaders.jsx'
|
||||||
|
|
||||||
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
||||||
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import TextItem from './TextItem.jsx';
|
import TextItem from './TextItem.jsx';
|
||||||
import Word from './Word.jsx';
|
import Word from './Word.jsx';
|
||||||
import WordType from './markdown/WordType.jsx';
|
import WordType from './markdown/WordType.jsx';
|
||||||
|
import WordFormat from './markdown/WordFormat.jsx';
|
||||||
import LineItem from './LineItem.jsx';
|
import LineItem from './LineItem.jsx';
|
||||||
import StashingStream from './StashingStream.jsx';
|
import StashingStream from './StashingStream.jsx';
|
||||||
import { ParsedElements } from './PageItem.jsx';
|
import { ParsedElements } from './PageItem.jsx';
|
||||||
@ -115,9 +116,10 @@ class WordDetectionStream extends StashingStream {
|
|||||||
results.push(...this.itemsToWords(stash, format));
|
results.push(...this.itemsToWords(stash, format));
|
||||||
}
|
}
|
||||||
|
|
||||||
itemsToWords(items, format) {
|
itemsToWords(items, formatName) {
|
||||||
const combinedText = combineText(items);
|
const combinedText = combineText(items);
|
||||||
const words = combinedText.split(' ');
|
const words = combinedText.split(' ');
|
||||||
|
const format = formatName ? WordFormat.enumValueOf(formatName) : null;
|
||||||
return words.filter(w => w.trim().length > 0).map(word => {
|
return words.filter(w => w.trim().length > 0).map(word => {
|
||||||
var type = null;
|
var type = null;
|
||||||
if (word.startsWith('http:')) {
|
if (word.startsWith('http:')) {
|
||||||
|
@ -16,10 +16,6 @@ export default class ToLineItemBlockTransformation extends Transformation {
|
|||||||
this.showWhitespaces = false;
|
this.showWhitespaces = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
showPageSelection() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
showModificationCheckbox() {
|
showModificationCheckbox() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -16,10 +16,6 @@ export default class ToLineItemTransformation extends Transformation {
|
|||||||
this.showWhitespaces = false;
|
this.showWhitespaces = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
showPageSelection() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
showModificationCheckbox() {
|
showModificationCheckbox() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -16,10 +16,6 @@ export default class ToTextItemTransformation extends Transformation {
|
|||||||
this.showWhitespaces = false;
|
this.showWhitespaces = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
showPageSelection() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
showModificationCheckbox() {
|
showModificationCheckbox() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -14,10 +14,6 @@ export default class Transformation {
|
|||||||
this.itemType = itemType;
|
this.itemType = itemType;
|
||||||
}
|
}
|
||||||
|
|
||||||
showPageSelection() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
showModificationCheckbox() {
|
showModificationCheckbox() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
|||||||
format = WordFormat.BOLD;
|
format = WordFormat.BOLD;
|
||||||
}
|
}
|
||||||
if (format) {
|
if (format) {
|
||||||
fontToFormats.set(key, format);
|
fontToFormats.set(key, format.name);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
fontIdToName.sort();
|
fontIdToName.sort();
|
||||||
|
Loading…
Reference in New Issue
Block a user