mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
rename element type to block type
This commit is contained in:
parent
c4679238cd
commit
a0c5bb29d6
@ -1,14 +1,14 @@
|
|||||||
import { Enum } from 'enumify';
|
import { Enum } from 'enumify';
|
||||||
import { linesToText } from './markdown/WordType.jsx';
|
import { linesToText } from './WordType.jsx';
|
||||||
import LineItemBlock from './LineItemBlock.jsx';
|
import LineItemBlock from '../LineItemBlock.jsx';
|
||||||
|
|
||||||
// An Markdown element
|
// An Markdown block
|
||||||
export default class ElementType extends Enum {
|
export default class BlockType extends Enum {
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO rename to BlockType
|
//TODO rename to BlockType
|
||||||
|
|
||||||
ElementType.initEnum({
|
BlockType.initEnum({
|
||||||
H1: {
|
H1: {
|
||||||
headline: true,
|
headline: true,
|
||||||
headlineLevel: 1,
|
headlineLevel: 1,
|
||||||
@ -84,8 +84,8 @@ ElementType.initEnum({
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
export function isHeadline(elementType: ElementType) {
|
export function isHeadline(type: BlockType) {
|
||||||
return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
|
return type && type.name.length == 2 && type.name[0] === 'H'
|
||||||
}
|
}
|
||||||
|
|
||||||
export function blockToText(block: LineItemBlock) {
|
export function blockToText(block: LineItemBlock) {
|
||||||
@ -97,17 +97,17 @@ export function blockToText(block: LineItemBlock) {
|
|||||||
|
|
||||||
export function headlineByLevel(level) {
|
export function headlineByLevel(level) {
|
||||||
if (level == 1) {
|
if (level == 1) {
|
||||||
return ElementType.H1;
|
return BlockType.H1;
|
||||||
} else if (level == 2) {
|
} else if (level == 2) {
|
||||||
return ElementType.H2;
|
return BlockType.H2;
|
||||||
} else if (level == 3) {
|
} else if (level == 3) {
|
||||||
return ElementType.H3;
|
return BlockType.H3;
|
||||||
} else if (level == 4) {
|
} else if (level == 4) {
|
||||||
return ElementType.H4;
|
return BlockType.H4;
|
||||||
} else if (level == 5) {
|
} else if (level == 5) {
|
||||||
return ElementType.H5;
|
return BlockType.H5;
|
||||||
} else if (level == 6) {
|
} else if (level == 6) {
|
||||||
return ElementType.H6;
|
return BlockType.H6;
|
||||||
}
|
}
|
||||||
throw "Unsupported headline level: " + level + " (supported are 1-6)";
|
throw "Unsupported headline level: " + level + " (supported are 1-6)";
|
||||||
}
|
}
|
@ -2,7 +2,7 @@ import React from 'react';
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import TextPageView from '../../components/debug/TextPageView.jsx';
|
import TextPageView from '../../components/debug/TextPageView.jsx';
|
||||||
import ParseResult from '../ParseResult.jsx';
|
import ParseResult from '../ParseResult.jsx';
|
||||||
import { blockToText } from '../ElementType.jsx';
|
import { blockToText } from '../markdown/BlockType.jsx';
|
||||||
|
|
||||||
export default class ToTextBlocks extends Transformation {
|
export default class ToTextBlocks extends Transformation {
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ import ParseResult from '../../ParseResult.jsx';
|
|||||||
import LineItem from '../../LineItem.jsx';
|
import LineItem from '../../LineItem.jsx';
|
||||||
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
|
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
|
||||||
import LineConverter from '../../LineConverter.jsx';
|
import LineConverter from '../../LineConverter.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import BlockType from '../../markdown/BlockType.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
|
|
||||||
|
|
||||||
@ -59,7 +59,7 @@ export default class CompactLines extends ToLineItemTransformation {
|
|||||||
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
||||||
}
|
}
|
||||||
if (lineItem.parsedElements.footnotes.length > 0) {
|
if (lineItem.parsedElements.footnotes.length > 0) {
|
||||||
lineItem.type = ElementType.FOOTNOTES;
|
lineItem.type = BlockType.FOOTNOTES;
|
||||||
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import BlockType from '../../markdown/BlockType.jsx';
|
||||||
import { headlineByLevel } from '../../ElementType.jsx';
|
import { headlineByLevel } from '../../markdown/BlockType.jsx';
|
||||||
import { isListItem } from '../../../stringFunctions.jsx';
|
import { isListItem } from '../../../stringFunctions.jsx';
|
||||||
|
|
||||||
//Detect headlines based on heights
|
//Detect headlines based on heights
|
||||||
@ -25,9 +25,9 @@ export default class DetectHeaders extends ToLineItemTransformation {
|
|||||||
const height = item.height;
|
const height = item.height;
|
||||||
if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||||
if (height == maxHeight) {
|
if (height == maxHeight) {
|
||||||
item.type = ElementType.H1;
|
item.type = BlockType.H1;
|
||||||
} else {
|
} else {
|
||||||
item.type = ElementType.H2;
|
item.type = BlockType.H2;
|
||||||
}
|
}
|
||||||
item.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
detectedHeaders++;
|
detectedHeaders++;
|
||||||
@ -44,7 +44,7 @@ export default class DetectHeaders extends ToLineItemTransformation {
|
|||||||
page.items.forEach(item => {
|
page.items.forEach(item => {
|
||||||
if (!item.type && item.height == range.max) {
|
if (!item.type && item.height == range.max) {
|
||||||
item.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
item.type = ElementType.enumValueOf(headlineType);
|
item.type = BlockType.enumValueOf(headlineType);
|
||||||
detectedHeaders++
|
detectedHeaders++
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -3,7 +3,7 @@ import ParseResult from '../../ParseResult.jsx';
|
|||||||
import LineItem from '../../LineItem.jsx';
|
import LineItem from '../../LineItem.jsx';
|
||||||
import Word from '../../Word.jsx';
|
import Word from '../../Word.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import BlockType from '../../markdown/BlockType.jsx';
|
||||||
import { isListItemCharacter, isNumberedListItem } from '../../../stringFunctions.jsx';
|
import { isListItemCharacter, isNumberedListItem } from '../../../stringFunctions.jsx';
|
||||||
|
|
||||||
//Detect items starting with -, •, etc...
|
//Detect items starting with -, •, etc...
|
||||||
@ -26,7 +26,7 @@ export default class DetectListItems extends ToLineItemTransformation {
|
|||||||
foundListItems++
|
foundListItems++
|
||||||
if (item.words[0].string === '-') {
|
if (item.words[0].string === '-') {
|
||||||
item.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
item.type = ElementType.LIST;
|
item.type = BlockType.LIST;
|
||||||
} else {
|
} else {
|
||||||
item.annotation = REMOVED_ANNOTATION;
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
const newWords = item.words.map(word => new Word({
|
const newWords = item.words.map(word => new Word({
|
||||||
@ -37,13 +37,13 @@ export default class DetectListItems extends ToLineItemTransformation {
|
|||||||
...item,
|
...item,
|
||||||
words: newWords,
|
words: newWords,
|
||||||
annotation: ADDED_ANNOTATION,
|
annotation: ADDED_ANNOTATION,
|
||||||
type: ElementType.LIST
|
type: BlockType.LIST
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
} else if (isNumberedListItem(text)) { //TODO check that starts with 1 (kala chakra)
|
} else if (isNumberedListItem(text)) { //TODO check that starts with 1 (kala chakra)
|
||||||
foundNumberedItems++;
|
foundNumberedItems++;
|
||||||
item.annotation = DETECTED_ANNOTATION;
|
item.annotation = DETECTED_ANNOTATION;
|
||||||
item.type = ElementType.LIST;
|
item.type = BlockType.LIST;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -4,8 +4,8 @@ import LineItem from '../../LineItem.jsx';
|
|||||||
import Word from '../../Word.jsx';
|
import Word from '../../Word.jsx';
|
||||||
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
import HeadlineFinder from '../../HeadlineFinder.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import BlockType from '../../markdown/BlockType.jsx';
|
||||||
import { headlineByLevel } from '../../ElementType.jsx';
|
import { headlineByLevel } from '../../markdown/BlockType.jsx';
|
||||||
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx'
|
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx'
|
||||||
|
|
||||||
//Detect table of contents pages plus linked headlines
|
//Detect table of contents pages plus linked headlines
|
||||||
@ -90,7 +90,7 @@ export default class DetectTOC extends ToLineItemTransformation {
|
|||||||
if (line === headlineItem) {
|
if (line === headlineItem) {
|
||||||
newBlocks.push(new LineItem({
|
newBlocks.push(new LineItem({
|
||||||
...line,
|
...line,
|
||||||
type: ElementType.H2,
|
type: BlockType.H2,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
@ -114,7 +114,7 @@ export default class DetectTOC extends ToLineItemTransformation {
|
|||||||
words: [new Word({
|
words: [new Word({
|
||||||
string: ' '.repeat(tocLink.level * 3) + '-'
|
string: ' '.repeat(tocLink.level * 3) + '-'
|
||||||
})].concat(tocLink.lineItem.words),
|
})].concat(tocLink.lineItem.words),
|
||||||
type: ElementType.TOC,
|
type: BlockType.TOC,
|
||||||
annotation: ADDED_ANNOTATION
|
annotation: ADDED_ANNOTATION
|
||||||
}));
|
}));
|
||||||
});
|
});
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import BlockType from '../../markdown/BlockType.jsx';
|
||||||
import { minXFromBlocks } from '../../../pageItemFunctions.jsx';
|
import { minXFromBlocks } from '../../../pageItemFunctions.jsx';
|
||||||
|
|
||||||
//Detect items which are code/quote blocks
|
//Detect items which are code/quote blocks
|
||||||
@ -19,7 +19,7 @@ export default class DetectCodeQuoteBlocks extends ToLineItemBlockTransformation
|
|||||||
page.items.forEach(block => {
|
page.items.forEach(block => {
|
||||||
if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) {
|
if (!block.type && looksLikeCodeBlock(minX, block.items, mostUsedHeight)) {
|
||||||
block.annotation = DETECTED_ANNOTATION;
|
block.annotation = DETECTED_ANNOTATION;
|
||||||
block.type = ElementType.CODE;
|
block.type = BlockType.CODE;
|
||||||
foundCodeItems++;
|
foundCodeItems++;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -2,7 +2,7 @@ import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx'
|
|||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import Word from '../../Word.jsx';
|
import Word from '../../Word.jsx';
|
||||||
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
|
import { MODIFIED_ANNOTATION, UNCHANGED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import ElementType from '../../ElementType.jsx';
|
import BlockType from '../../markdown/BlockType.jsx';
|
||||||
|
|
||||||
// Cares for proper sub-item spacing/leveling
|
// Cares for proper sub-item spacing/leveling
|
||||||
export default class DetectListLevels extends ToLineItemBlockTransformation {
|
export default class DetectListLevels extends ToLineItemBlockTransformation {
|
||||||
@ -17,7 +17,7 @@ export default class DetectListLevels extends ToLineItemBlockTransformation {
|
|||||||
var modifiedBlocks = 0;
|
var modifiedBlocks = 0;
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
|
|
||||||
page.items.filter(block => block.type === ElementType.LIST).forEach(listBlock => {
|
page.items.filter(block => block.type === BlockType.LIST).forEach(listBlock => {
|
||||||
var lastItemX;
|
var lastItemX;
|
||||||
var currentLevel = 0;
|
var currentLevel = 0;
|
||||||
const xByLevel = {};
|
const xByLevel = {};
|
||||||
|
Loading…
Reference in New Issue
Block a user