mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
[WIP] first draft complete formats transformation
This commit is contained in:
parent
81518a857b
commit
10cc7cf0ab
@ -52,8 +52,26 @@ export default class TextItemTable extends React.Component {
|
||||
{ textItem.type ? textItem.type.name : '' }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
||||
{ textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
|
||||
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? <div>
|
||||
Footnote-Link
|
||||
</div> : '' }
|
||||
{ textItem.parsedElements && textItem.parsedElements.containLinks ? <div>
|
||||
Link
|
||||
</div> : '' }
|
||||
{ textItem.lineFormat ? <div>
|
||||
{ textItem.lineFormat.name }
|
||||
</div> : '' }
|
||||
{ textItem.unopenedFormat ? <div>
|
||||
Unopened
|
||||
{ ' ' + textItem.unopenedFormat.name }
|
||||
</div> : '' }
|
||||
{ textItem.parsedElements && textItem.parsedElements.inlineFormats > 0 ? <div>
|
||||
{ textItem.parsedElements.inlineFormats + 'x Bold/Italic' }
|
||||
</div> : '' }
|
||||
{ textItem.unclosedFormat ? <div>
|
||||
Unclosed
|
||||
{ ' ' + textItem.unclosedFormat.name }
|
||||
</div> : '' }
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
|
@ -8,12 +8,11 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
|
||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||
import CompleteFormats from './transformations/textitem/CompleteFormats.jsx'
|
||||
|
||||
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
||||
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
||||
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
|
||||
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||
|
||||
@ -59,15 +58,14 @@ export default class AppState {
|
||||
new VerticalToHorizontal(),
|
||||
new PostprocessLines(),
|
||||
new DetectTOC(),
|
||||
new DetectListItems(),
|
||||
new DetectHeaders(),
|
||||
new CompleteFormats(),
|
||||
new DetectListItems(),
|
||||
|
||||
new GatherBlocks(),
|
||||
new DetectCodeQuoteBlocks(),
|
||||
new DetectListLevels(),
|
||||
|
||||
// new DetectFormats(),
|
||||
// new HeadlineToUppercase(),
|
||||
new ToTextBlocks(),
|
||||
new ToMarkdown()];
|
||||
|
||||
|
@ -18,12 +18,14 @@ export class ParsedElements {
|
||||
this.footnoteLinks = options.footnoteLinks || [];
|
||||
this.footnotes = options.footnotes || [];
|
||||
this.containLinks = options.containLinks;
|
||||
this.inlineFormats = options.inlineFormats || 0;
|
||||
}
|
||||
|
||||
add(parsedElements) {
|
||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||
this.containLinks = this.containLinks || parsedElements.containLinks;
|
||||
this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats;
|
||||
}
|
||||
|
||||
}
|
@ -3,4 +3,23 @@ import { Enum } from 'enumify';
|
||||
// The text formats a font can stand for.
//
// Every value carries:
//  - needFormat: whether text in this format has to be wrapped in symbols at all
//  - startSymbol / endSymbol: the symbols put in front of / behind the formatted text
//    (only present when needFormat is true)
export default class StringFormat extends Enum {
}

// The enum is initialised exactly once, with the object form, so that each value
// gets its properties. The former array-based initEnum([...]) call is dropped.
// NOTE(review): enumify presumably does not support initialising the same class twice - confirm.
StringFormat.initEnum({
    STANDARD: {
        needFormat: false
    },
    BOLD: {
        needFormat: true,
        startSymbol: '**',
        endSymbol: '**'
    },
    OBLIQUE: {
        needFormat: true,
        startSymbol: '_',
        endSymbol: '_'
    },
    BOLD_OBLIQUE: {
        needFormat: true,
        startSymbol: '**_',
        endSymbol: '_**'
    }
})
|
@ -13,6 +13,10 @@ export default class TextItem extends PageItem {
|
||||
this.font = options.font;
|
||||
this.fontAscent = options.fontAscent;
|
||||
this.fontDescent = options.fontDescent;
|
||||
|
||||
this.lineFormat = options.lineFormat;
|
||||
this.unopenedFormat = options.unopenedFormat;
|
||||
this.unclosedFormat = options.unclosedFormat;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -7,10 +7,8 @@ import { sortByX } from '../textItemFunctions.jsx'
|
||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
export default class TextItemLineCompactor {
|
||||
|
||||
constructor(options) {
|
||||
if (options) {
|
||||
this.transformEmphasis = options.transformEmphasis || true;
|
||||
}
|
||||
constructor(fontToFormats) {
|
||||
this.fontToFormats = fontToFormats;
|
||||
}
|
||||
|
||||
// returns a CombineResult
|
||||
@ -22,8 +20,10 @@ export default class TextItemLineCompactor {
|
||||
// we can't trust the order of occurrence, esp. footnoteLinks like to come last
|
||||
sortByX(lineItems);
|
||||
|
||||
var combinedItem;
|
||||
const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
|
||||
const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
|
||||
|
||||
var combinedItem;
|
||||
if (resolvedLineItems.length == 1) {
|
||||
combinedItem = resolvedLineItems[0];
|
||||
} else {
|
||||
@ -51,9 +51,93 @@ export default class TextItemLineCompactor {
|
||||
});
|
||||
}
|
||||
combinedItem.parsedElements = parsedElements;
|
||||
combinedItem.lineFormat = lineFormat;
|
||||
combinedItem.unopenedFormat = unopenedFormat;
|
||||
combinedItem.unclosedFormat = unclosedFormat;
|
||||
return combinedItem;
|
||||
}
|
||||
|
||||
addFormats(resolvedLineItems, parsedElements) {
|
||||
var inlineFormats = 0;
|
||||
var openFormatType;
|
||||
var openFormatItem;
|
||||
var openFormatIndex;
|
||||
var lastItem;
|
||||
|
||||
var lineFormat;
|
||||
var unopenedFormat;
|
||||
var unclosedFormat;
|
||||
|
||||
const addStartSymbol = () => {
|
||||
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
|
||||
...openFormatItem,
|
||||
text: openFormatType.startSymbol + openFormatItem.text
|
||||
}));
|
||||
}
|
||||
const addEndSymbol = (index) => {
|
||||
resolvedLineItems.splice(index, 1, new TextItem({
|
||||
...lastItem,
|
||||
text: lastItem.text + openFormatType.endSymbol
|
||||
}));
|
||||
}
|
||||
const addCompleteSymbol = () => {
|
||||
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
|
||||
...openFormatItem,
|
||||
text: openFormatType.startSymbol + openFormatItem.text + openFormatType.endSymbol
|
||||
}));
|
||||
}
|
||||
|
||||
const rollupOpenFormat = (endIndex) => {
|
||||
const formatFromBeginningOfLine = openFormatIndex == 0;
|
||||
const formatToEndOfLine = endIndex == resolvedLineItems.length - 1;
|
||||
if (formatFromBeginningOfLine) {
|
||||
if (formatToEndOfLine) {
|
||||
lineFormat = openFormatType;
|
||||
} else {
|
||||
unopenedFormat = openFormatType;
|
||||
addEndSymbol(endIndex);
|
||||
}
|
||||
} else {
|
||||
if (formatToEndOfLine) {
|
||||
unclosedFormat = openFormatType;
|
||||
addStartSymbol();
|
||||
} else {
|
||||
inlineFormats++;
|
||||
if (lastItem === openFormatItem) {
|
||||
addCompleteSymbol();
|
||||
} else {
|
||||
addStartSymbol();
|
||||
addEndSymbol();
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
resolvedLineItems.slice().forEach((item, i) => {
|
||||
const formatType = this.fontToFormats.get(item.font);
|
||||
if (openFormatType) {
|
||||
if (formatType !== openFormatType) { //closin existing format
|
||||
rollupOpenFormat(i - 1);
|
||||
openFormatType = formatType.needFormat ? formatType : null;
|
||||
openFormatItem = formatType.needFormat ? item : null;
|
||||
openFormatIndex = formatType.needFormat ? i : null;
|
||||
}
|
||||
} else {
|
||||
if (formatType.needFormat) {
|
||||
openFormatType = formatType;
|
||||
openFormatItem = item;
|
||||
openFormatIndex = i;
|
||||
}
|
||||
}
|
||||
lastItem = item;
|
||||
});
|
||||
if (openFormatType) {
|
||||
rollupOpenFormat(resolvedLineItems.length - 1);
|
||||
}
|
||||
parsedElements.inlineFormats = inlineFormats;
|
||||
return [lineFormat, unopenedFormat, unclosedFormat];
|
||||
}
|
||||
|
||||
resolveSpecialElements(lineItems) {
|
||||
const footnoteLinks = [];
|
||||
const footnotes = [];
|
||||
|
@ -57,11 +57,11 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||
var format;
|
||||
if (key == mostUsedFont) {
|
||||
format = StringFormat.STANDARD;
|
||||
} else if (fontName.includes('bold') && fontName.includes('bold')) {
|
||||
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
|
||||
format = StringFormat.BOLD_OBLIQUE;
|
||||
} else if (fontName.includes('bold')) {
|
||||
format = StringFormat.BOLD;
|
||||
} else if (fontName.includes('oblique')) {
|
||||
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
|
||||
format = StringFormat.OBLIQUE;
|
||||
} else if (fontName === maxHeightFont) {
|
||||
format = StringFormat.BOLD;
|
||||
|
@ -2,6 +2,7 @@ import React from 'react';
|
||||
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import { ParsedElements } from '../../PageItem.jsx';
|
||||
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
|
||||
import TextItemLineCompactor from '../../TextItemLineCompactor.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
@ -16,13 +17,18 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
const {mostUsedDistance, fontToFormats} = parseResult.globals;
|
||||
const foundFootnotes = [];
|
||||
const foundFootnoteLinks = [];
|
||||
var inlineFormats = 0;
|
||||
var lineFormats = 0;
|
||||
var unopenedFormats = 0;
|
||||
var unclosedFormats = 0;
|
||||
|
||||
const lineGrouper = new TextItemLineGrouper({
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
});
|
||||
const lineCompactor = new TextItemLineCompactor();
|
||||
const lineCompactor = new TextItemLineCompactor(fontToFormats);
|
||||
|
||||
parseResult.pages.forEach(page => {
|
||||
if (page.items.length > 0) {
|
||||
@ -32,6 +38,13 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
var lineItem;
|
||||
if (textItemsOfLine.length == 1) {
|
||||
lineItem = textItemsOfLine[0];
|
||||
const formatType = fontToFormats.get(lineItem.font);
|
||||
if (formatType.needFormat) {
|
||||
lineItem.lineFormat = formatType;
|
||||
lineItem.parsedElements = new ParsedElements({
|
||||
completeLineFormats: 1
|
||||
});
|
||||
}
|
||||
} else {
|
||||
textItemsOfLine.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
@ -50,7 +63,11 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||
}
|
||||
inlineFormats += lineItem.parsedElements.inlineFormats;
|
||||
}
|
||||
if (lineItem.lineFormat) lineFormats++;
|
||||
if (lineItem.unopenedFormat) unopenedFormats++;
|
||||
if (lineItem.unclosedFormat) unclosedFormats++;
|
||||
lineItem.text = lineItem.text.trim();
|
||||
newItems.push(lineItem);
|
||||
});
|
||||
@ -62,9 +79,10 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
// 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']',
|
||||
//'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']',
|
||||
// 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']',
|
||||
'Detected ' + lineFormats + ' line formats',
|
||||
'Detected ' + inlineFormats + ' inline formats',
|
||||
'Detected ' + unclosedFormats + ' opened un-closed formats',
|
||||
'Detected ' + unopenedFormats + ' un-opened closed formats',
|
||||
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
|
||||
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
|
||||
]
|
||||
|
@ -0,0 +1,170 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx';
|
||||
|
||||
//Complete unopened/unclosed bold/italic formats
|
||||
export default class CompleteFormats extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Complete Bold/Italics");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
// remove line formats from headers
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(item => {
|
||||
if (item.type && item.type.headline) {
|
||||
if (item.lineFormat || item.unopenedFormat || item.unclosedFormat) {
|
||||
item.lineFormat = null;
|
||||
item.unopenedFormat = null;
|
||||
item.unclosedFormat = null;
|
||||
item.annotation = UNCHANGED_ANNOTATION;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
//close open formats
|
||||
parseResult.pages.forEach(page => {
|
||||
const itemStack = new ItemStack();
|
||||
page.items.forEach(item => {
|
||||
itemStack.consume(item);
|
||||
});
|
||||
page.items = itemStack.getResults();
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: []
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Collects the items of one page and completes formats across item boundaries.
//
// At most one format is "open" at a time: openFormat is the format and openFormatItem
// the item it was last seen on. Items are handed in via consume() in page order;
// getResults() closes a still open format and returns all collected items.
// Items which get superseded by a copy with an added symbol are annotated as removed,
// the copies as added.
class ItemStack {

    constructor() {
        // same "nothing open" state as clear() produces
        this.openFormat = null;
        this.openFormatItem = null;
        this.resultItems = [];
    }

    // remembers format as open and textItem as the item carrying it
    cache(textItem, format) {
        this.openFormat = format;
        this.openFormatItem = textItem;
    }

    // If a format is open: marks the item carrying it as removed and writes a copy
    // with the end symbol appended instead. Does nothing when no format is open.
    closeOpenFormat() {
        if (this.openFormat) {
            this.openFormatItem.annotation = REMOVED_ANNOTATION;
            this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat));
            this.clear();
        }
    }

    clear() {
        this.openFormat = null;
        this.openFormatItem = null;
    }

    writeToResults(textItem) {
        this.resultItems.push(textItem);
    }

    // returns all collected items, closing a format which is still open first
    getResults() {
        this.closeOpenFormat();
        return this.resultItems;
    }

    // Takes the next item of the page. The item itself is always written to the results;
    // if a symbol had to be added, a decorated copy is written right behind it.
    consume(item) {
        let newItem;

        // format ends on this line but was never opened: add the start symbol
        const handleFreshUnopened = () => {
            item.annotation = REMOVED_ANNOTATION;
            newItem = textItemWithOpening(item, item.unopenedFormat);
        };

        // whole line is formatted: add the start symbol and keep the format open
        const handleFreshLine = () => {
            item.annotation = REMOVED_ANNOTATION;
            newItem = textItemWithOpening(item, item.lineFormat);
            this.cache(newItem, item.lineFormat);
        };

        // format starts on this line but is not closed: keep it open.
        // If an opening-symbol copy was created for this item it becomes the open item,
        // and it stays in newItem so that it is written to the results as well -
        // otherwise the line gets lost as soon as the format is closed by a following item.
        const handleFreshUnclosed = () => {
            this.cache(newItem ? newItem : item, item.unclosedFormat);
        };

        //flush open format if possible
        if (this.openFormat) {
            if (item.unopenedFormat) {
                if (item.unopenedFormat === this.openFormat) {
                    //good, closing an opened
                    this.clear();
                } else {
                    this.closeOpenFormat();
                    handleFreshUnopened();
                }
            }

            if (item.lineFormat) {
                if (item.lineFormat === this.openFormat) {
                    // the open format simply continues on this line
                    this.cache(item, item.lineFormat);
                } else {
                    this.closeOpenFormat();
                    handleFreshLine();
                }
            }

            if (item.unclosedFormat) {
                this.closeOpenFormat();
                handleFreshUnclosed();
            }

            if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) {
                this.closeOpenFormat();
            }

        } else { // handle fresh items
            if (item.unopenedFormat) {
                handleFreshUnopened();
            }
            if (item.lineFormat) {
                handleFreshLine();
            }
            if (item.unclosedFormat) {
                handleFreshUnclosed();
            }
        }

        this.writeToResults(item);
        if (newItem) {
            this.writeToResults(newItem);
        }
    }
}
|
||||
|
||||
// Builds a copy of textItem, annotated as added, whose text starts with the start symbol of format.
function textItemWithOpening(textItem, format) {
    const openedText = format.startSymbol + textItem.text;
    const options = {
        ...textItem,
        text: openedText,
        annotation: ADDED_ANNOTATION
    };
    return new TextItem(options);
}
|
||||
|
||||
// Builds a copy of textItem, annotated as added, whose text ends with the end symbol of format.
function textItemWithClosing(textItem, format) {
    const closedText = textItem.text + format.endSymbol;
    const options = {
        ...textItem,
        text: closedText,
        annotation: ADDED_ANNOTATION
    };
    return new TextItem(options);
}
|
@ -3,6 +3,7 @@ import ParseResult from '../../ParseResult.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { headlineByLevel } from '../../ElementType.jsx';
|
||||
import { isListItem } from '../../../functions.jsx';
|
||||
|
||||
//Detect headlines, e.g. by the height of the text items
|
||||
export default class DetectHeaders extends ToTextItemTransformation {
|
||||
@ -56,7 +57,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
var lastHeight;
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height > mostUsedHeight) {
|
||||
if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) {
|
||||
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
|
||||
heights.push(textItem.height);
|
||||
}
|
||||
@ -69,7 +70,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
|
||||
const headlineType = headlineByLevel(2 + i);
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == height) {
|
||||
if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) {
|
||||
detectedHeaders++;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = headlineType;
|
||||
|
Loading…
Reference in New Issue
Block a user