mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-21 18:11:54 +02:00
[WIP] remove MarkdownElement in favor of ElementType enum
This commit is contained in:
parent
15c5946073
commit
f8fecc4c1d
@ -1,37 +1,86 @@
|
|||||||
import { Enum } from 'enumify';
|
import { Enum } from 'enumify';
|
||||||
|
import TextItem from './TextItem.jsx';
|
||||||
|
import TextItemBlock from './TextItemBlock.jsx';
|
||||||
|
|
||||||
export default class ElementType extends Enum {
|
export default class ElementType extends Enum {
|
||||||
}
|
}
|
||||||
|
|
||||||
ElementType.initEnum({
|
ElementType.initEnum({
|
||||||
H1: {
|
H1: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return '# ' + concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
H2: {
|
H2: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return '## ' + concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
H3: {
|
H3: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return '### ' + concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
H4: {
|
H4: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return '#### ' + concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
H5: {
|
H5: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return '##### ' + concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
H6: {
|
H6: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return '###### ' + concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
TOC: {
|
TOC: {
|
||||||
mergeToBlock: true
|
mergeToBlock: true,
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
FOOTNOTES: {
|
FOOTNOTES: {
|
||||||
mergeToBlock: true,
|
mergeToBlock: true,
|
||||||
mergeFollowingNonTypedItems: true
|
mergeFollowingNonTypedItems: true,
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
CODE: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return '```\n' + concatTextItems(block.textItems) + '```'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LIST: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
PARAGRAPH: {
|
||||||
|
toText(block:TextItemBlock) {
|
||||||
|
return concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
//export default ElementType
|
export function blockToText(block: TextItemBlock) {
|
||||||
|
if (!block.type) {
|
||||||
|
return concatTextItems(block.textItems);
|
||||||
|
}
|
||||||
|
console.debug(block.type);
|
||||||
|
return block.type.toText(block);
|
||||||
|
}
|
||||||
|
|
||||||
|
function concatTextItems(textItems: TextItem[]) {
|
||||||
|
var text = '';
|
||||||
|
textItems.forEach(item => {
|
||||||
|
text += item.text + '\n';
|
||||||
|
});
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
export function headlineByLevel(level) {
|
export function headlineByLevel(level) {
|
||||||
if (level == 1) {
|
if (level == 1) {
|
||||||
|
@ -1,75 +0,0 @@
|
|||||||
import TextItemBlock from './TextItemBlock.jsx';
|
|
||||||
import TextItemCombiner from './TextItemCombiner.jsx';
|
|
||||||
import TextItem from './TextItem.jsx';
|
|
||||||
|
|
||||||
export const HEADLINE1 = "Headline 1";
|
|
||||||
export const HEADLINE2 = "Headline 2";
|
|
||||||
export const HEADLINE3 = "Headline 3";
|
|
||||||
export const HEADLINE4 = "Headline 4";
|
|
||||||
export const HEADLINE5 = "Headline 5";
|
|
||||||
export const HEADLINE6 = "Headline 6";
|
|
||||||
export const PARAGRAPH = "Paragraph";
|
|
||||||
export const LIST_BLOCK = "List";
|
|
||||||
export const CODE_BLOCK = "Code/Quote";
|
|
||||||
export const TOC_BLOCK = "TOC";
|
|
||||||
export const FOOTNOTE_BLOCK = "Footnotes"
|
|
||||||
|
|
||||||
export function headlineByLevel(level) {
|
|
||||||
if (level == 1) {
|
|
||||||
return HEADLINE1;
|
|
||||||
} else if (level == 2) {
|
|
||||||
return HEADLINE2;
|
|
||||||
} else if (level == 3) {
|
|
||||||
return HEADLINE3;
|
|
||||||
} else if (level == 4) {
|
|
||||||
return HEADLINE4;
|
|
||||||
} else if (level == 5) {
|
|
||||||
return HEADLINE5;
|
|
||||||
} else if (level == 6) {
|
|
||||||
return HEADLINE6;
|
|
||||||
}
|
|
||||||
throw "Unsupported headline level: " + level;
|
|
||||||
}
|
|
||||||
|
|
||||||
export function blockToText(block: TextItemBlock) {
|
|
||||||
switch (block.type) {
|
|
||||||
case CODE_BLOCK:
|
|
||||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
|
||||||
case TOC_BLOCK:
|
|
||||||
var text = '';
|
|
||||||
//TODO real links
|
|
||||||
//TODO de-duplicate with DetectLists ?
|
|
||||||
block.textItems.forEach(item => {
|
|
||||||
text += item.text + '\n';
|
|
||||||
});
|
|
||||||
return text;
|
|
||||||
case HEADLINE1:
|
|
||||||
return '# ' + concatTextItems(block.textItems);
|
|
||||||
case HEADLINE2:
|
|
||||||
return '## ' + concatTextItems(block.textItems);
|
|
||||||
case HEADLINE3:
|
|
||||||
return '### ' + concatTextItems(block.textItems);
|
|
||||||
case HEADLINE4:
|
|
||||||
return '#### ' + concatTextItems(block.textItems);
|
|
||||||
case HEADLINE5:
|
|
||||||
return '##### ' + concatTextItems(block.textItems);
|
|
||||||
case HEADLINE6:
|
|
||||||
return '###### ' + concatTextItems(block.textItems);
|
|
||||||
default:
|
|
||||||
var textItems = block.textItems;
|
|
||||||
if (!block.type) {
|
|
||||||
//TODO mostUsedDistance
|
|
||||||
textItems = new TextItemCombiner({}).combine(textItems).textItems;
|
|
||||||
}
|
|
||||||
return concatTextItems(textItems);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function concatTextItems(textItems: TextItem[]) {
|
|
||||||
var text = '';
|
|
||||||
textItems.forEach(item => {
|
|
||||||
text += item.text + '\n';
|
|
||||||
});
|
|
||||||
return text;
|
|
||||||
}
|
|
@ -3,7 +3,7 @@ import ParseResult from '../ParseResult.jsx';
|
|||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
import TextItemBlock from '../TextItemBlock.jsx';
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||||
import { CODE_BLOCK } from '../MarkdownElements.jsx';
|
import ElementType from '../ElementType.jsx';
|
||||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||||
|
|
||||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
||||||
@ -55,7 +55,7 @@ export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
|
|||||||
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
|
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
|
||||||
} else {
|
} else {
|
||||||
preceedingCodeBlock = new TextItemBlock({
|
preceedingCodeBlock = new TextItemBlock({
|
||||||
type: CODE_BLOCK,
|
type: ElementType.CODE,
|
||||||
annotation: ADDED_ANNOTATION,
|
annotation: ADDED_ANNOTATION,
|
||||||
textItems: combineResult.textItems,
|
textItems: combineResult.textItems,
|
||||||
parsedElements: combineResult.parsedElements
|
parsedElements: combineResult.parsedElements
|
||||||
|
@ -3,7 +3,8 @@ import ParseResult from '../ParseResult.jsx';
|
|||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
import TextItemBlock from '../TextItemBlock.jsx';
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
import { HEADLINE1, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
|
import ElementType from '../ElementType.jsx';
|
||||||
|
import { headlineByLevel } from '../ElementType.jsx';
|
||||||
|
|
||||||
//Detect headlines
|
//Detect headlines
|
||||||
export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||||
@ -157,9 +158,9 @@ function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
|
|||||||
block.annotation = REMOVED_ANNOTATION;
|
block.annotation = REMOVED_ANNOTATION;
|
||||||
const combineResult = textCombiner.combine(block.textItems);
|
const combineResult = textCombiner.combine(block.textItems);
|
||||||
if (height == maxHeight) {
|
if (height == maxHeight) {
|
||||||
addNewBlock(newBlocks, combineResult, HEADLINE1);
|
addNewBlock(newBlocks, combineResult, ElementType.H1);
|
||||||
} else if (combineResult.textItems.length == 1) {
|
} else if (combineResult.textItems.length == 1) {
|
||||||
addNewBlock(newBlocks, combineResult, HEADLINE2);
|
addNewBlock(newBlocks, combineResult, ElementType.H2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -4,7 +4,7 @@ import TextItem from '../TextItem.jsx';
|
|||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
import TextItemBlock from '../TextItemBlock.jsx';
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||||
import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx';
|
import ElementType from '../ElementType.jsx';
|
||||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||||
|
|
||||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
||||||
@ -83,14 +83,14 @@ export default class DetectLists extends ToTextItemBlockTransformation {
|
|||||||
if (itemsBeforeFirstLineItem.length > 0) {
|
if (itemsBeforeFirstLineItem.length > 0) {
|
||||||
newBlocks.push(new TextItemBlock({
|
newBlocks.push(new TextItemBlock({
|
||||||
textItems: itemsBeforeFirstLineItem,
|
textItems: itemsBeforeFirstLineItem,
|
||||||
type: PARAGRAPH,
|
type: ElementType.PARAGRAPH,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
//TODO display with whitespace pre support
|
//TODO display with whitespace pre support
|
||||||
newBlocks.push(new TextItemBlock({
|
newBlocks.push(new TextItemBlock({
|
||||||
textItems: listBlockItems,
|
textItems: listBlockItems,
|
||||||
type: LIST_BLOCK,
|
type: ElementType.LIST,
|
||||||
annotation: ADDED_ANNOTATION,
|
annotation: ADDED_ANNOTATION,
|
||||||
parsedElements: combineResult.parsedElements
|
parsedElements: combineResult.parsedElements
|
||||||
}));
|
}));
|
||||||
|
@ -60,7 +60,6 @@ function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
|
|||||||
if (item.type) {
|
if (item.type) {
|
||||||
return !item.type.mergeToBlock;
|
return !item.type.mergeToBlock;
|
||||||
} else {
|
} else {
|
||||||
console.debug(item);
|
|
||||||
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
|
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
|
||||||
return shouldSplit(lastItem, item, minX, mostUsedDistance);
|
return shouldSplit(lastItem, item, minX, mostUsedDistance);
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@ import React from 'react';
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import TextPageView from '../../components/debug/TextPageView.jsx';
|
import TextPageView from '../../components/debug/TextPageView.jsx';
|
||||||
import ParseResult from '../ParseResult.jsx';
|
import ParseResult from '../ParseResult.jsx';
|
||||||
import { blockToText } from '../MarkdownElements.jsx';
|
import { blockToText } from '../ElementType.jsx';
|
||||||
|
|
||||||
export default class ToTextBlocks extends Transformation {
|
export default class ToTextBlocks extends Transformation {
|
||||||
|
|
||||||
@ -18,7 +18,8 @@ export default class ToTextBlocks extends Transformation {
|
|||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
const textItems = [];
|
const textItems = [];
|
||||||
page.items.forEach(block => {
|
page.items.forEach(block => {
|
||||||
const category = block.type ? block.type : 'Unknown';
|
//TODO category to type (before have no unknowns, have paragraph)
|
||||||
|
const category = block.type ? block.type.name : 'Unknown';
|
||||||
textItems.push({
|
textItems.push({
|
||||||
category: category,
|
category: category,
|
||||||
text: blockToText(block)
|
text: blockToText(block)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user