mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-25 01:03:59 +01:00
Detect bold and emphasis
This commit is contained in:
parent
6441580889
commit
b7393fc806
@ -2,6 +2,7 @@ import { Enum } from 'enumify';
|
|||||||
|
|
||||||
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||||
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
||||||
|
import DetectFormats from './transformations/DetectFormats.jsx'
|
||||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||||
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||||
@ -23,6 +24,7 @@ export default class AppState {
|
|||||||
this.transformations = [
|
this.transformations = [
|
||||||
new CalculateGlobalStats(),
|
new CalculateGlobalStats(),
|
||||||
new RoundCoordinates(),
|
new RoundCoordinates(),
|
||||||
|
new DetectFormats(),
|
||||||
new CombineSameY(),
|
new CombineSameY(),
|
||||||
new RemoveWhitespaces(),
|
new RemoveWhitespaces(),
|
||||||
new DetectFootnotes(),
|
new DetectFootnotes(),
|
||||||
|
@ -17,6 +17,12 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
|||||||
<li>
|
<li>
|
||||||
{ 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
|
{ 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
|
||||||
</li>
|
</li>
|
||||||
|
<li>
|
||||||
|
{ 'Max height: ' + parseResult.globals.maxHeight + ' ' }
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
{ 'Max height font: ' + parseResult.globals.maxHeightFont + ' ' }
|
||||||
|
</li>
|
||||||
<hr/>
|
<hr/>
|
||||||
<li>
|
<li>
|
||||||
{ 'Items per height: ' + JSON.stringify(parseResult.summary.heightToOccurrence) + ' ' }
|
{ 'Items per height: ' + JSON.stringify(parseResult.summary.heightToOccurrence) + ' ' }
|
||||||
@ -31,17 +37,25 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
|||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const heightToOccurrence = {};
|
const heightToOccurrence = {};
|
||||||
const fontToOccurrence = {};
|
const fontToOccurrence = {};
|
||||||
|
var maxHeight = 0;
|
||||||
|
var maxHeightFont;
|
||||||
parseResult.content.forEach(page => {
|
parseResult.content.forEach(page => {
|
||||||
page.textItems.forEach(item => {
|
page.textItems.forEach(item => {
|
||||||
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
|
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
|
||||||
fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1;
|
fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1;
|
||||||
|
if (item.height > maxHeight) {
|
||||||
|
maxHeight = item.height;
|
||||||
|
maxHeightFont = item.font;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
|
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
|
||||||
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
||||||
parseResult.globals = {
|
parseResult.globals = {
|
||||||
mostUsedHeight: mostUsedHeight,
|
mostUsedHeight: mostUsedHeight,
|
||||||
mostUsedFont: mostUsedFont
|
mostUsedFont: mostUsedFont,
|
||||||
|
maxHeight: maxHeight,
|
||||||
|
maxHeightFont: maxHeightFont
|
||||||
}
|
}
|
||||||
parseResult.summary = {
|
parseResult.summary = {
|
||||||
heightToOccurrence: heightToOccurrence,
|
heightToOccurrence: heightToOccurrence,
|
||||||
|
178
src/javascript/models/transformations/DetectFormats.jsx
Normal file
178
src/javascript/models/transformations/DetectFormats.jsx
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
import React from 'react';
|
||||||
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import ParseResult from '../ParseResult.jsx';
|
||||||
|
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
|
//Detect word/sentence formats like bold, italic,...
|
||||||
|
export default class DetectFormats extends ToPdfViewTransformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Detect Bold/Italic");
|
||||||
|
}
|
||||||
|
|
||||||
|
createSummaryView(parseResult:ParseResult) {
|
||||||
|
return <div>
|
||||||
|
Detected
|
||||||
|
{ ' ' + parseResult.summary.foundFormats + ' ' } formats.
|
||||||
|
</div>;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
transform(parseResult:ParseResult) {
|
||||||
|
var foundFormats = 0;
|
||||||
|
const {mostUsedHeight, mostUsedFont, maxHeightFont} = parseResult.globals;
|
||||||
|
const symbols = {
|
||||||
|
bold: '**',
|
||||||
|
emphasis: '_'
|
||||||
|
}
|
||||||
|
|
||||||
|
const newContent = parseResult.content.map(page => {
|
||||||
|
const newTextItems = [];
|
||||||
|
|
||||||
|
//bundle items on same Y
|
||||||
|
const groupedItems = groupByFollowingY(page.textItems);
|
||||||
|
var lastItem;
|
||||||
|
var lastFormat;
|
||||||
|
|
||||||
|
const addNextItem = (item, format) => {
|
||||||
|
if (lastItem) {
|
||||||
|
if (lastFormat !== format) {
|
||||||
|
lastItem.text = appendSymbol(lastItem.text, symbols[lastFormat]);
|
||||||
|
if (lastItem.annotation) {
|
||||||
|
lastItem.annotation = newAnnotation(lastFormat);
|
||||||
|
} else {
|
||||||
|
lastItem.annotation = newAnnotation('End ' + lastFormat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastItem.height = mostUsedHeight;
|
||||||
|
newTextItems.push(lastItem);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (format) {
|
||||||
|
if (lastFormat !== format) {
|
||||||
|
item.text = prependSymbol(item.text, symbols[format]);
|
||||||
|
item.annotation = newAnnotation('Start ' + format);
|
||||||
|
}
|
||||||
|
lastItem = item;
|
||||||
|
lastFormat = format;
|
||||||
|
} else {
|
||||||
|
newTextItems.push(item);
|
||||||
|
lastItem = null;
|
||||||
|
lastFormat = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
groupedItems.forEach(itemGroup => {
|
||||||
|
|
||||||
|
//probably headline
|
||||||
|
const differentHeightsButSameFont = itemsHaveDifferentHeightsButSameFont(itemGroup);
|
||||||
|
|
||||||
|
itemGroup.forEach(item => {
|
||||||
|
const paragraphHeighOrSlightlyBigger = item.height == mostUsedHeight || item.height == mostUsedHeight + 1;
|
||||||
|
if (!differentHeightsButSameFont && paragraphHeighOrSlightlyBigger && item.font !== mostUsedFont) {
|
||||||
|
// item.annotation = REMOVED_ANNOTATION;
|
||||||
|
|
||||||
|
const format = item.font === maxHeightFont ? 'bold' : 'emphasis';
|
||||||
|
addNextItem(item, format);
|
||||||
|
|
||||||
|
//TODO test with womb compilation. _Th_, _ff_,... check font like SanSarif ?
|
||||||
|
//TODO don't touch 'eingerückte' Zeichen => detect early ?
|
||||||
|
//TODO (Maybe) could detect combined bold & emphasis like font=bold.font + emph.font !?
|
||||||
|
foundFormats++;
|
||||||
|
} else {
|
||||||
|
addNextItem(item);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
...page,
|
||||||
|
textItems: newTextItems
|
||||||
|
};
|
||||||
|
});
|
||||||
|
return new ParseResult({
|
||||||
|
...parseResult,
|
||||||
|
content: newContent,
|
||||||
|
summary: {
|
||||||
|
foundFormats: foundFormats
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
completeTransform(parseResult:ParseResult) {
|
||||||
|
parseResult.content.forEach(page => {
|
||||||
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||||
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
|
});
|
||||||
|
return parseResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
function newAnnotation(name) {
|
||||||
|
return new Annotation({
|
||||||
|
category: name,
|
||||||
|
color: 'green'
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
//groups all following text items with the same Y together
|
||||||
|
// Splits textItems into groups of directly consecutive items that share the same y.
// Items with an equal y that are separated by an item with another y end up in
// different groups. Returns an array of groups; an empty input yields an empty
// array (no empty placeholder group).
function groupByFollowingY(textItems) {
    const yArrays = [];
    var itemsWithSameY = [];
    var lastItem;
    textItems.forEach(item => {
        if (itemsWithSameY.length == 0 || item.y == lastItem.y) {
            itemsWithSameY.push(item);
        } else {
            yArrays.push(itemsWithSameY);
            itemsWithSameY = [item];
        }
        lastItem = item;
    })
    // only emit the trailing group when it actually contains items
    if (itemsWithSameY.length > 0) {
        yArrays.push(itemsWithSameY);
    }
    return yArrays;
}
|
||||||
|
|
||||||
|
// True when the group contains more than one distinct height while every item
// uses one and the same font.
function itemsHaveDifferentHeightsButSameFont(itemGroup) {
    const distinctHeights = new Set(itemGroup.map(entry => entry.height));
    const distinctFonts = new Set(itemGroup.map(entry => entry.font));
    const singleFont = distinctFonts.size == 1;
    return distinctHeights.size > 1 && singleFont;
}
|
||||||
|
|
||||||
|
//TODO move to stringFunctions
|
||||||
|
|
||||||
|
function prependSymbol(text, symbol) {
|
||||||
|
if (text.charAt(0) == ' ') {
|
||||||
|
return ' ' + symbol + removeLeadingWhitespace(text);
|
||||||
|
}
|
||||||
|
return symbol + text;
|
||||||
|
}
|
||||||
|
|
||||||
|
function appendSymbol(text, symbol) {
|
||||||
|
if (text.charAt(text.length - 1) == ' ') {
|
||||||
|
return removeTrailingWhitespace(text) + symbol + ' ';
|
||||||
|
}
|
||||||
|
return text + symbol;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the text without its leading blanks (only the ' ' character is
// stripped, other whitespace such as tabs is kept).
function removeLeadingWhitespace(text) {
    // find the first non-blank position, then cut once instead of re-slicing
    // the string for every removed character
    var start = 0;
    while (start < text.length && text.charAt(start) == ' ') {
        start++;
    }
    return text.substring(start, text.length);
}
|
||||||
|
|
||||||
|
// Returns the text without its trailing blanks (only the ' ' character is
// stripped, other whitespace such as tabs is kept).
function removeTrailingWhitespace(text) {
    // find the end of the non-blank part, then cut once instead of re-slicing
    // the string for every removed character
    var end = text.length;
    while (end > 0 && text.charAt(end - 1) == ' ') {
        end--;
    }
    return text.substring(0, end);
}
|
Loading…
Reference in New Issue
Block a user