mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
rename element type to block type
This commit is contained in:
parent
c4679238cd
commit
a0c5bb29d6
@ -1,14 +1,14 @@
|
||||
import { Enum } from 'enumify';
|
||||
import { linesToText } from './markdown/WordType.jsx';
|
||||
import LineItemBlock from './LineItemBlock.jsx';
|
||||
import { linesToText } from './WordType.jsx';
|
||||
import LineItemBlock from '../LineItemBlock.jsx';
|
||||
|
||||
// A Markdown element
|
||||
export default class ElementType extends Enum {
|
||||
// A Markdown block
|
||||
export default class BlockType extends Enum {
|
||||
}
|
||||
|
||||
//TODO rename to BlockType
|
||||
|
||||
ElementType.initEnum({
|
||||
BlockType.initEnum({
|
||||
H1: {
|
||||
headline: true,
|
||||
headlineLevel: 1,
|
||||
@ -84,8 +84,8 @@ ElementType.initEnum({
|
||||
}
|
||||
});
|
||||
|
||||
export function isHeadline(elementType: ElementType) {
|
||||
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
|
||||
export function isHeadline(type: BlockType) {
|
||||
return type && type.name.length == 2 && type.name[0] === 'H'
|
||||
}
|
||||
|
||||
export function blockToText(block: LineItemBlock) {
|
||||
@ -97,17 +97,17 @@ export function blockToText(block: LineItemBlock) {
|
||||
|
||||
export function headlineByLevel(level) {
|
||||
if (level == 1) {
|
||||
return ElementType.H1;
|
||||
return BlockType.H1;
|
||||
} else if (level == 2) {
|
||||
return ElementType.H2;
|
||||
return BlockType.H2;
|
||||
} else if (level == 3) {
|
||||
return ElementType.H3;
|
||||
return BlockType.H3;
|
||||
} else if (level == 4) {
|
||||
return ElementType.H4;
|
||||
return BlockType.H4;
|
||||
} else if (level == 5) {
|
||||
return ElementType.H5;
|
||||
return BlockType.H5;
|
||||
} else if (level == 6) {
|
||||
return ElementType.H6;
|
||||
return BlockType.H6;
|
||||
}
|
||||
throw "Unsupported headline level: " + level + " (supported are 1-6)";
|
||||
}
|
@ -2,7 +2,7 @@ import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextPageView from '../../components/debug/TextPageView.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { blockToText } from '../ElementType.jsx';
|
||||
import { blockToText } from '../markdown/BlockType.jsx';
|
||||
|
||||
export default class ToTextBlocks extends Transformation {
|
||||
|
||||
|
@ -5,7 +5,7 @@ import ParseResult from '../../ParseResult.jsx';
|
||||
import LineItem from '../../LineItem.jsx';
|
||||
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
|
||||
import LineConverter from '../../LineConverter.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import BlockType from '../../markdown/BlockType.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@ export default class CompactLines extends ToLineItemTransformation {
|
||||
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
||||
}
|
||||
if (lineItem.parsedElements.footnotes.length > 0) {
|
||||
lineItem.type = ElementType.FOOTNOTES;
|
||||
lineItem.type = BlockType.FOOTNOTES;
|
||||
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { headlineByLevel } from '../../ElementType.jsx';
|
||||
import BlockType from '../../markdown/BlockType.jsx';
|
||||
import { headlineByLevel } from '../../markdown/BlockType.jsx';
|
||||
import { isListItem } from '../../../stringFunctions.jsx';
|
||||
|
||||
//Detect headlines based on heights
|
||||
@ -25,9 +25,9 @@ export default class DetectHeaders extends ToLineItemTransformation {
|
||||
const height = item.height;
|
||||
if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||
if (height == maxHeight) {
|
||||
item.type = ElementType.H1;
|
||||
item.type = BlockType.H1;
|
||||
} else {
|
||||
item.type = ElementType.H2;
|
||||
item.type = BlockType.H2;
|
||||
}
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
detectedHeaders++;
|
||||
@ -44,7 +44,7 @@ export default class DetectHeaders extends ToLineItemTransformation {
|
||||
page.items.forEach(item => {
|
||||
if (!item.type && item.height == range.max) {
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = ElementType.enumValueOf(headlineType);
|
||||
item.type = BlockType.enumValueOf(headlineType);
|
||||
detectedHeaders++
|
||||
}
|
||||
});
|
||||
|
@ -3,7 +3,7 @@ import ParseResult from '../../ParseResult.jsx';
|
||||
import LineItem from '../../LineItem.jsx';
|
||||
import Word from '../../Word.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import BlockType from '../../markdown/BlockType.jsx';
|
||||
import { isListItemCharacter, isNumberedListItem } from '../../../stringFunctions.jsx';
|
||||
|
||||
//Detect items starting with -, •, etc...
|
||||
@ -26,7 +26,7 @@ export default class DetectListItems extends ToLineItemTransformation {
|
||||
foundListItems++
|
||||
if (item.words[0].string === '-') {
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = ElementType.LIST;
|
||||
item.type = BlockType.LIST;
|
||||
} else {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
const newWords = item.words.map(word => new Word({
|
||||
@ -37,13 +37,13 @@ export default class DetectListItems extends ToLineItemTransformation {
|
||||
...item,
|
||||
words: newWords,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
type: ElementType.LIST
|
||||
type: BlockType.LIST
|
||||
}));
|
||||
}
|
||||
} else if (isNumberedListItem(text)) { //TODO check that the numbering starts with 1 (kala chakra)
|
||||
foundNumberedItems++;
|
||||
item.annotation = DETECTED_ANNOTATION;
|
||||
item.type = ElementType.LIST;
|
||||
item.type = BlockType.LIST;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -4,8 +4,8 @@ import LineItem from '../../LineItem.jsx';
|
||||
import Word from '../../Word.jsx';
|
||||
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { headlineByLevel } from '../../ElementType.jsx';
|
||||
import BlockType from '../../markdown/BlockType.jsx';
|
||||
import { headlineByLevel } from '../../markdown/BlockType.jsx';
|
||||
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx'
|
||||
|
||||
//Detect table of contents pages plus linked headlines
|
||||
@ -90,7 +90,7 @@ export default class DetectTOC extends ToLineItemTransformation {
|
||||
if (line === headlineItem) {
|
||||
newBlocks.push(new LineItem({
|
||||
...line,
|
||||
type: ElementType.H2,
|
||||
type: BlockType.H2,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
}
|
||||
@ -114,7 +114,7 @@ export default class DetectTOC extends ToLineItemTransformation {
|
||||
words: [new Word({
|
||||
string: ' '.repeat(tocLink.level * 3) + '-'
|
||||
})].concat(tocLink.lineItem.words),
|
||||
type: ElementType.TOC,
|
||||
type: BlockType.TOC,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
});
|
||||
|
@ -1,7 +1,7 @@
|
||||
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import BlockType from '../../markdown/BlockType.jsx';
|
||||
import { minXFromBlocks } from '../../../pageItemFunctions.jsx';
|
||||
|
||||
//Detect items which are code/quote blocks
|
||||
@ -19,7 +19,7 @@ export default class DetectCodeQuoteBlocks extends ToLineItemBlockTransformation
|
||||
page.items.forEach(block => {
|
||||
if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) {
|
||||
block.annotation = DETECTED_ANNOTATION;
|
||||
block.type = ElementType.CODE;
|
||||
block.type = BlockType.CODE;
|
||||
foundCodeItems++;
|
||||
}
|
||||
});
|
||||
|
@ -2,7 +2,7 @@ import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx'
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import Word from '../../Word.jsx';
|
||||
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import BlockType from '../../markdown/BlockType.jsx';
|
||||
|
||||
// Takes care of proper sub-item spacing/leveling
|
||||
export default class DetectListLevels extends ToLineItemBlockTransformation {
|
||||
@ -17,7 +17,7 @@ export default class DetectListLevels extends ToLineItemBlockTransformation {
|
||||
var modifiedBlocks = 0;
|
||||
parseResult.pages.forEach(page => {
|
||||
|
||||
page.items.filter(block => block.type === ElementType.LIST).forEach(listBlock => {
|
||||
page.items.filter(block => block.type === BlockType.LIST).forEach(listBlock => {
|
||||
var lastItemX;
|
||||
var currentLevel = 0;
|
||||
const xByLevel = {};
|
||||
|
Loading…
Reference in New Issue
Block a user