mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-24 16:54:12 +01:00
[WIP] use fontMap to map fonts to formats
This commit is contained in:
parent
b5bb56b647
commit
d927b45087
@ -92,7 +92,7 @@ export default class DebugView extends React.Component {
|
|||||||
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
|
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
|
||||||
const statisticsAsList = Object.keys(parseResult.globals).map((key, i) => {
|
const statisticsAsList = Object.keys(parseResult.globals).map((key, i) => {
|
||||||
return <li key={ i }>
|
return <li key={ i }>
|
||||||
{ key + ': ' + parseResult.globals[key] }
|
{ key + ': ' + JSON.stringify(parseResult.globals[key]) }
|
||||||
</li>
|
</li>
|
||||||
});
|
});
|
||||||
const messagesAsList = parseResult.messages.map((message, i) => {
|
const messagesAsList = parseResult.messages.map((message, i) => {
|
||||||
|
@ -25,26 +25,8 @@ export default class AppState {
|
|||||||
this.mainView = View.UPLOAD;
|
this.mainView = View.UPLOAD;
|
||||||
this.fileBuffer;
|
this.fileBuffer;
|
||||||
this.metadata;
|
this.metadata;
|
||||||
this.fontMap;
|
|
||||||
this.pages = [];
|
this.pages = [];
|
||||||
this.transformations = [
|
this.transformations ;
|
||||||
new CalculateGlobalStats(),
|
|
||||||
new CompactLines(),
|
|
||||||
new RemoveRepetitiveElements(),
|
|
||||||
new VerticalToHorizontal(),
|
|
||||||
new PostprocessLines(),
|
|
||||||
new DetectTOC(),
|
|
||||||
new DetectListItems(),
|
|
||||||
new DetectHeaders(),
|
|
||||||
|
|
||||||
new GatherBlocks(),
|
|
||||||
new DetectCodeQuoteBlocks(),
|
|
||||||
new DetectListLevels(),
|
|
||||||
|
|
||||||
// new DetectFormats(),
|
|
||||||
// new HeadlineToUppercase(),
|
|
||||||
new ToTextBlocks(),
|
|
||||||
new ToMarkdown()];
|
|
||||||
|
|
||||||
//bind functions
|
//bind functions
|
||||||
this.render = this.render.bind(this);
|
this.render = this.render.bind(this);
|
||||||
@ -66,11 +48,31 @@ export default class AppState {
|
|||||||
|
|
||||||
storePdfPages(metadata, fontMap, pages) {
|
storePdfPages(metadata, fontMap, pages) {
|
||||||
this.metadata = metadata;
|
this.metadata = metadata;
|
||||||
this.fontMap = fontMap;
|
|
||||||
this.pages = pages;
|
this.pages = pages;
|
||||||
this.fileBuffer = null;
|
this.fileBuffer = null;
|
||||||
this.mainView = View.RESULT;
|
this.mainView = View.RESULT;
|
||||||
|
|
||||||
|
this.transformations = [
|
||||||
|
new CalculateGlobalStats(fontMap),
|
||||||
|
new CompactLines(),
|
||||||
|
new RemoveRepetitiveElements(),
|
||||||
|
new VerticalToHorizontal(),
|
||||||
|
new PostprocessLines(),
|
||||||
|
new DetectTOC(),
|
||||||
|
new DetectListItems(),
|
||||||
|
new DetectHeaders(),
|
||||||
|
|
||||||
|
new GatherBlocks(),
|
||||||
|
new DetectCodeQuoteBlocks(),
|
||||||
|
new DetectListLevels(),
|
||||||
|
|
||||||
|
// new DetectFormats(),
|
||||||
|
// new HeadlineToUppercase(),
|
||||||
|
new ToTextBlocks(),
|
||||||
|
new ToMarkdown()];
|
||||||
|
|
||||||
this.render();
|
this.render();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
switchMainView(view) {
|
switchMainView(view) {
|
||||||
|
@ -2,6 +2,7 @@ import { Enum } from 'enumify';
|
|||||||
import TextItem from './TextItem.jsx';
|
import TextItem from './TextItem.jsx';
|
||||||
import TextItemBlock from './TextItemBlock.jsx';
|
import TextItemBlock from './TextItemBlock.jsx';
|
||||||
|
|
||||||
|
// A Markdown element
|
||||||
export default class ElementType extends Enum {
|
export default class ElementType extends Enum {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
6
src/javascript/models/StringFormat.jsx
Normal file
6
src/javascript/models/StringFormat.jsx
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import { Enum } from 'enumify';
|
||||||
|
|
||||||
|
export default class StringFormat extends Enum {
|
||||||
|
}
|
||||||
|
|
||||||
|
StringFormat.initEnum(['STANDARD', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE'])
|
@ -1,14 +1,15 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
|
import StringFormat from '../../StringFormat.jsx';
|
||||||
|
|
||||||
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor(fontMap) {
|
||||||
super("Calculate Statistics");
|
super("Calculate Statistics");
|
||||||
|
this.fontMap = fontMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
|
|
||||||
// Parse heights
|
// Parse heights
|
||||||
const heightToOccurrence = {};
|
const heightToOccurrence = {};
|
||||||
const fontToOccurrence = {};
|
const fontToOccurrence = {};
|
||||||
@ -48,6 +49,31 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
|||||||
const mostUsedDistance = parseInt(getMostUsedKey(distanceToOccurrence));
|
const mostUsedDistance = parseInt(getMostUsedKey(distanceToOccurrence));
|
||||||
|
|
||||||
|
|
||||||
|
const fontIdToName = [];
|
||||||
|
const fontToFormats = new Map();
|
||||||
|
// Walk every entry of the font map. For each font, record an "id = name"
// string for the debug messages and derive a StringFormat that is stored in
// fontToFormats under the font's key.
this.fontMap.forEach(function(value, key) {
    fontIdToName.push(key + " = " + value.name);
    // Style detection is done on the lowercased name so it is case-insensitive.
    const fontName = value.name.toLowerCase();
    let format;
    // Loose equality kept on purpose: the types of `key` and `mostUsedFont`
    // are not visible here — TODO confirm both are strings before using `===`.
    if (key == mostUsedFont) {
        // The most used font is treated as the standard format, whatever its name says.
        format = StringFormat.STANDARD;
    } else if (fontName.includes('bold') && fontName.includes('oblique')) {
        // Must be tested before the single-style branches below. The previous
        // condition checked 'bold' twice, so every bold font was classified as
        // BOLD_OBLIQUE and the plain BOLD branch could never be reached.
        format = StringFormat.BOLD_OBLIQUE;
    } else if (fontName.includes('bold')) {
        format = StringFormat.BOLD;
    } else if (fontName.includes('oblique')) {
        format = StringFormat.OBLIQUE;
    } else if (fontName === maxHeightFont) {
        // NOTE(review): this compares the lowercased font *name* with
        // maxHeightFont, while the first branch compares the font *key* with
        // mostUsedFont — confirm which identifier maxHeightFont holds.
        format = StringFormat.BOLD;
    } else {
        format = StringFormat.STANDARD;
    }
    fontToFormats.set(key, format);
});
|
||||||
|
fontIdToName.sort();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//Make a copy of the originals so all following transformation don't modify them
|
//Make a copy of the originals so all following transformation don't modify them
|
||||||
const newPages = parseResult.pages.map(page => {
|
const newPages = parseResult.pages.map(page => {
|
||||||
return {
|
return {
|
||||||
@ -68,11 +94,13 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
|||||||
mostUsedDistance: mostUsedDistance,
|
mostUsedDistance: mostUsedDistance,
|
||||||
maxHeight: maxHeight,
|
maxHeight: maxHeight,
|
||||||
maxHeightFont: maxHeightFont,
|
maxHeightFont: maxHeightFont,
|
||||||
|
fontToFormats: fontToFormats
|
||||||
},
|
},
|
||||||
messages: [
|
messages: [
|
||||||
'Items per height: ' + JSON.stringify(heightToOccurrence),
|
'Items per height: ' + JSON.stringify(heightToOccurrence),
|
||||||
'Items per font: ' + JSON.stringify(fontToOccurrence),
|
'Items per font: ' + JSON.stringify(fontToOccurrence),
|
||||||
'Items per distance: ' + JSON.stringify(distanceToOccurrence)
|
'Items per distance: ' + JSON.stringify(distanceToOccurrence),
|
||||||
|
'Fonts:' + JSON.stringify(fontIdToName)
|
||||||
]
|
]
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user