mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-19 03:58:24 +01:00
[WIP] remove MarkdownElement in favor of ElementType enum
This commit is contained in:
parent
15c5946073
commit
f8fecc4c1d
@ -1,37 +1,86 @@
|
||||
import { Enum } from 'enumify';
|
||||
import TextItem from './TextItem.jsx';
|
||||
import TextItemBlock from './TextItemBlock.jsx';
|
||||
|
||||
/**
 * Enumeration of the markdown element types a TextItemBlock can be tagged with.
 *
 * The members are registered by the `initEnum` call below. Every member
 * provides `toText(block)`, which renders the block's text items as markdown.
 * Optional flags on a member:
 *  - `mergeToBlock`: read by the block-gathering code (`!item.type.mergeToBlock`
 *    decides whether a typed item flushes the current block).
 *  - `mergeFollowingNonTypedItems`: NOTE(review): consumer is not visible in
 *    this change; presumably lets following untyped items join the block.
 */
export default class ElementType extends Enum {
}

ElementType.initEnum({
    H1: {
        toText(block:TextItemBlock) {
            return '# ' + concatTextItems(block.textItems);
        }
    },
    H2: {
        toText(block:TextItemBlock) {
            return '## ' + concatTextItems(block.textItems);
        }
    },
    H3: {
        toText(block:TextItemBlock) {
            return '### ' + concatTextItems(block.textItems);
        }
    },
    H4: {
        toText(block:TextItemBlock) {
            return '#### ' + concatTextItems(block.textItems);
        }
    },
    H5: {
        toText(block:TextItemBlock) {
            return '##### ' + concatTextItems(block.textItems);
        }
    },
    H6: {
        toText(block:TextItemBlock) {
            return '###### ' + concatTextItems(block.textItems);
        }
    },
    TOC: {
        mergeToBlock: true,
        toText(block:TextItemBlock) {
            return concatTextItems(block.textItems);
        }
    },
    FOOTNOTES: {
        mergeToBlock: true,
        mergeFollowingNonTypedItems: true,
        toText(block:TextItemBlock) {
            return concatTextItems(block.textItems);
        }
    },
    CODE: {
        // Wrapped in a fenced block; concatTextItems already ends with a newline,
        // so the closing fence lands on its own line.
        toText(block:TextItemBlock) {
            return '```\n' + concatTextItems(block.textItems) + '```';
        }
    },
    LIST: {
        toText(block:TextItemBlock) {
            return concatTextItems(block.textItems);
        }
    },
    PARAGRAPH: {
        toText(block:TextItemBlock) {
            return concatTextItems(block.textItems);
        }
    }
});
|
||||
|
||||
//export default ElementType
|
||||
/**
 * Renders a block as markdown text.
 *
 * Blocks without a type are emitted as their raw text items (one per line);
 * typed blocks delegate to the `toText` of their type.
 *
 * @param block the block to render
 * @returns the markdown text of the block
 */
export function blockToText(block: TextItemBlock) {
    if (!block.type) {
        return concatTextItems(block.textItems);
    }
    return block.type.toText(block);
}
|
||||
|
||||
/**
 * Concatenates the text of the given items, terminating every item
 * (including the last one) with a newline.
 *
 * @param textItems the items whose `text` is joined
 * @returns the joined text, or '' for an empty list
 */
function concatTextItems(textItems: TextItem[]) {
    return textItems.map(item => item.text + '\n').join('');
}
|
||||
|
||||
export function headlineByLevel(level) {
|
||||
if (level == 1) {
|
||||
|
@ -1,75 +0,0 @@
|
||||
import TextItemBlock from './TextItemBlock.jsx';
|
||||
import TextItemCombiner from './TextItemCombiner.jsx';
|
||||
import TextItem from './TextItem.jsx';
|
||||
|
||||
// Display names of the markdown element categories. They are used as the
// `type` value of a block and matched in the `switch` of blockToText.
export const HEADLINE1 = "Headline 1";
export const HEADLINE2 = "Headline 2";
export const HEADLINE3 = "Headline 3";
export const HEADLINE4 = "Headline 4";
export const HEADLINE5 = "Headline 5";
export const HEADLINE6 = "Headline 6";
export const PARAGRAPH = "Paragraph";
export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC";
export const FOOTNOTE_BLOCK = "Footnotes";
|
||||
|
||||
/**
 * Maps a headline level to the matching HEADLINE constant.
 *
 * @param level the headline level, 1 to 6
 * @returns HEADLINE1 ... HEADLINE6 for levels 1 ... 6
 * @throws the string "Unsupported headline level: <level>" for any other value
 */
export function headlineByLevel(level) {
    const headlines = [HEADLINE1, HEADLINE2, HEADLINE3, HEADLINE4, HEADLINE5, HEADLINE6];
    for (let index = 0; index < headlines.length; index++) {
        // Loose equality is kept on purpose so values such as '2' keep matching.
        if (level == index + 1) {
            return headlines[index];
        }
    }
    // The thrown value stays a plain string so callers observe the same error as before.
    throw "Unsupported headline level: " + level;
}
|
||||
|
||||
/**
 * Renders a block as markdown text, depending on the block's type constant.
 *
 *  - CODE_BLOCK: text items wrapped in a fenced code block
 *  - TOC_BLOCK: text items, one per line
 *  - HEADLINE1 ... HEADLINE6: text items prefixed with 1 to 6 '#' characters
 *  - anything else: text items, one per line; blocks without a type are first
 *    run through TextItemCombiner
 *
 * @param block the block to render
 * @returns the markdown text of the block
 */
export function blockToText(block: TextItemBlock) {
    switch (block.type) {
        case CODE_BLOCK:
            return '```\n' + concatTextItems(block.textItems) + '```';
        case TOC_BLOCK:
            //TODO real links
            //TODO de-duplicate with DetectLists ?
            return concatTextItems(block.textItems);
        case HEADLINE1:
            return '# ' + concatTextItems(block.textItems);
        case HEADLINE2:
            return '## ' + concatTextItems(block.textItems);
        case HEADLINE3:
            return '### ' + concatTextItems(block.textItems);
        case HEADLINE4:
            return '#### ' + concatTextItems(block.textItems);
        case HEADLINE5:
            return '##### ' + concatTextItems(block.textItems);
        case HEADLINE6:
            return '###### ' + concatTextItems(block.textItems);
        default: {
            let textItems = block.textItems;
            if (!block.type) {
                //TODO mostUsedDistance
                textItems = new TextItemCombiner({}).combine(textItems).textItems;
            }
            return concatTextItems(textItems);
        }
    }
}
|
||||
|
||||
|
||||
/**
 * Concatenates the text of the given items, terminating every item
 * (including the last one) with a newline.
 *
 * @param textItems the items whose `text` is joined
 * @returns the joined text, or '' for an empty list
 */
function concatTextItems(textItems: TextItem[]) {
    return textItems.map(item => item.text + '\n').join('');
}
|
@ -3,7 +3,7 @@ import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { CODE_BLOCK } from '../MarkdownElements.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detect quotes, code, etc., which are transformed to markdown code syntax
|
||||
@ -55,7 +55,7 @@ export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
|
||||
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
|
||||
} else {
|
||||
preceedingCodeBlock = new TextItemBlock({
|
||||
type: CODE_BLOCK,
|
||||
type: ElementType.CODE,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
textItems: combineResult.textItems,
|
||||
parsedElements: combineResult.parsedElements
|
||||
|
@ -3,7 +3,8 @@ import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { HEADLINE1, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { headlineByLevel } from '../ElementType.jsx';
|
||||
|
||||
//Detect headlines
|
||||
export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||
@ -157,9 +158,9 @@ function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (height == maxHeight) {
|
||||
addNewBlock(newBlocks, combineResult, HEADLINE1);
|
||||
addNewBlock(newBlocks, combineResult, ElementType.H1);
|
||||
} else if (combineResult.textItems.length == 1) {
|
||||
addNewBlock(newBlocks, combineResult, HEADLINE2);
|
||||
addNewBlock(newBlocks, combineResult, ElementType.H2);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -4,7 +4,7 @@ import TextItem from '../TextItem.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detect list items and group them into list blocks
|
||||
@ -83,14 +83,14 @@ export default class DetectLists extends ToTextItemBlockTransformation {
|
||||
if (itemsBeforeFirstLineItem.length > 0) {
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: itemsBeforeFirstLineItem,
|
||||
type: PARAGRAPH,
|
||||
type: ElementType.PARAGRAPH,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
}
|
||||
//TODO display with whitespace pre support
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: listBlockItems,
|
||||
type: LIST_BLOCK,
|
||||
type: ElementType.LIST,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
parsedElements: combineResult.parsedElements
|
||||
}));
|
||||
|
@ -60,7 +60,6 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
|
||||
if (item.type) {
|
||||
return !item.type.mergeToBlock;
|
||||
} else {
|
||||
console.debug(item);
|
||||
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
|
||||
return shouldSplit(lastItem, item, minX, mostUsedDistance);
|
||||
}
|
||||
|
@ -2,7 +2,7 @@ import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextPageView from '../../components/debug/TextPageView.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { blockToText } from '../MarkdownElements.jsx';
|
||||
import { blockToText } from '../ElementType.jsx';
|
||||
|
||||
export default class ToTextBlocks extends Transformation {
|
||||
|
||||
@ -18,7 +18,8 @@ export default class ToTextBlocks extends Transformation {
|
||||
parseResult.pages.forEach(page => {
|
||||
const textItems = [];
|
||||
page.items.forEach(block => {
|
||||
const category = block.type ? block.type : 'Unknown';
|
||||
//TODO category to type (before have no unknowns, have paragraph)
|
||||
const category = block.type ? block.type.name : 'Unknown';
|
||||
textItems.push({
|
||||
category: category,
|
||||
text: blockToText(block)
|
||||
|
Loading…
Reference in New Issue
Block a user