mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-24 19:41:24 +02:00
[WIP] Simplify code/quote detection
This commit is contained in:
parent
c6f592d3fc
commit
5caf8154db
@ -56,7 +56,7 @@ export function removeLeadingWhitespaces(string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Tells whether a line of text looks like a bullet list item.
 *
 * A list item consists of optional leading whitespace, a bullet character
 * (`-` or `•`), one mandatory whitespace character and then any text up to
 * the end of the line. The text itself may end with a bullet character.
 *
 * @param string the line to inspect
 * @returns `true` if the line has the shape of a bullet list item
 */
export function isListItem(string) {
    // No `g` flag: `test()` on a global regex is stateful (it advances
    // `lastIndex`), which is never wanted for a plain yes/no match.
    return /^\s*[-•]\s.*$/.test(string);
}
|
||||||
|
|
||||||
export function isNumberedListItem(string) {
|
export function isNumberedListItem(string) {
|
||||||
|
@ -8,16 +8,14 @@ import DetectTOC from './transformations/DetectTOC.jsx'
|
|||||||
import DetectListItems from './transformations/DetectListItems.jsx'
|
import DetectListItems from './transformations/DetectListItems.jsx'
|
||||||
|
|
||||||
import GatherBlocks from './transformations/GatherBlocks.jsx'
|
import GatherBlocks from './transformations/GatherBlocks.jsx'
|
||||||
|
import DetectCodeQuoteBlocks from './transformations/DetectCodeQuoteBlocks.jsx'
|
||||||
import DetectListLevels from './transformations/DetectListLevels.jsx'
|
import DetectListLevels from './transformations/DetectListLevels.jsx'
|
||||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
|
||||||
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
||||||
// import DetectFormats from './transformations/DetectFormats.jsx'
|
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||||
// import CombineSameY from './transformations/CombineSameY.jsx';
|
|
||||||
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||||
// import DetectLinks from './transformations/DetectLinks.jsx'
|
// import DetectLinks from './transformations/DetectLinks.jsx'
|
||||||
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||||
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||||
// import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
|
||||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||||
|
|
||||||
@ -38,17 +36,15 @@ export default class AppState {
|
|||||||
new DetectListItems(),
|
new DetectListItems(),
|
||||||
|
|
||||||
new GatherBlocks(),
|
new GatherBlocks(),
|
||||||
|
new DetectCodeQuoteBlocks(),
|
||||||
new DetectListLevels(),
|
new DetectListLevels(),
|
||||||
new DetectCodeBlocks(),
|
|
||||||
new DetectHeadlines(),
|
new DetectHeadlines(),
|
||||||
|
|
||||||
// new DetectFormats(),
|
// new DetectFormats(),
|
||||||
// new CombineSameY(),
|
|
||||||
// new RemoveWhitespaces(),
|
// new RemoveWhitespaces(),
|
||||||
// new DetectLinks(),
|
// new DetectLinks(),
|
||||||
// new HeadlineDetector(),
|
// new HeadlineDetector(),
|
||||||
// new HeadlineToUppercase(),
|
// new HeadlineToUppercase(),
|
||||||
// new ToBlockSystem(),
|
|
||||||
new ToTextBlocks(),
|
new ToTextBlocks(),
|
||||||
new ToMarkdown()];
|
new ToMarkdown()];
|
||||||
|
|
||||||
|
@ -50,6 +50,7 @@ ElementType.initEnum({
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
CODE: {
|
CODE: {
|
||||||
|
mergeToBlock: true,
|
||||||
toText(block:TextItemBlock) {
|
toText(block:TextItemBlock) {
|
||||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
return '```\n' + concatTextItems(block.textItems) + '```'
|
||||||
}
|
}
|
||||||
|
@ -1,84 +0,0 @@
|
|||||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
|
||||||
import ParseResult from '../ParseResult.jsx';
|
|
||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
|
||||||
import ElementType from '../ElementType.jsx';
|
|
||||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
|
||||||
|
|
||||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
|
||||||
export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
|
|
||||||
|
|
||||||
constructor() {
|
|
||||||
super("Detect Code/Quotes");
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
|
||||||
const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
|
|
||||||
|
|
||||||
var foundBlocks = 0;
|
|
||||||
const textCombiner = new TextItemCombiner({
|
|
||||||
mostUsedDistance: mostUsedDistance
|
|
||||||
});
|
|
||||||
|
|
||||||
parseResult.pages.forEach(page => {
|
|
||||||
var minX = minXFromBlocks(page.items);
|
|
||||||
if (minX) {
|
|
||||||
const itemAreSuitable = (items) => {
|
|
||||||
for ( let item of items ) {
|
|
||||||
if (item.x == minX) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (item.height > mostUsedHeight + 1) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
const newBlocks = [];
|
|
||||||
var preceedingCodeBlock;
|
|
||||||
page.items.forEach(block => {
|
|
||||||
if (block.type) {
|
|
||||||
newBlocks.push(block);
|
|
||||||
preceedingCodeBlock = null;
|
|
||||||
} else {
|
|
||||||
if (itemAreSuitable(block.textItems)) {
|
|
||||||
const mergeWithPreceedingCodeBlock = preceedingCodeBlock && preceedingCodeBlock.textItems[preceedingCodeBlock.textItems.length - 1].y - block.textItems[0].y < mostUsedDistance * 2;
|
|
||||||
if (mergeWithPreceedingCodeBlock) {
|
|
||||||
newBlocks.pop();
|
|
||||||
}
|
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
|
||||||
newBlocks.push(block);
|
|
||||||
const combineResult = textCombiner.combine(block.textItems);
|
|
||||||
if (mergeWithPreceedingCodeBlock) {
|
|
||||||
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
|
|
||||||
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
|
|
||||||
} else {
|
|
||||||
preceedingCodeBlock = new TextItemBlock({
|
|
||||||
type: ElementType.CODE,
|
|
||||||
annotation: ADDED_ANNOTATION,
|
|
||||||
textItems: combineResult.textItems,
|
|
||||||
parsedElements: combineResult.parsedElements
|
|
||||||
});
|
|
||||||
foundBlocks++;
|
|
||||||
}
|
|
||||||
newBlocks.push(preceedingCodeBlock);
|
|
||||||
} else {
|
|
||||||
newBlocks.push(block);
|
|
||||||
preceedingCodeBlock = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
page.items = newBlocks;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return new ParseResult({
|
|
||||||
...parseResult,
|
|
||||||
messages: ['Detected ' + foundBlocks + ' code/quote blocks.']
|
|
||||||
});
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,52 @@
|
|||||||
|
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||||
|
import ParseResult from '../ParseResult.jsx';
|
||||||
|
import { DETECTED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
import ElementType from '../ElementType.jsx';
|
||||||
|
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||||
|
|
||||||
|
//Detect items which are code/quote blocks
|
||||||
|
export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Detect Code/Quote Blocks");
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(parseResult:ParseResult) {
|
||||||
|
const {mostUsedHeight} = parseResult.globals;
|
||||||
|
var foundCodeItems = 0;
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
var minX = minXFromBlocks(page.items);
|
||||||
|
page.items.forEach(block => {
|
||||||
|
if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) {
|
||||||
|
block.annotation = DETECTED_ANNOTATION;
|
||||||
|
block.type = ElementType.CODE;
|
||||||
|
foundCodeItems++;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return new ParseResult({
|
||||||
|
...parseResult,
|
||||||
|
messages: [
|
||||||
|
'Detected ' + foundCodeItems + ' code/quote items.',
|
||||||
|
]
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
function looksLikeCodeBlock(minX, textItems, mostUsedHeight) {
|
||||||
|
if (textItems.length == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (textItems.length == 1) {
|
||||||
|
return textItems[0].x > minX && textItems[0].height <= mostUsedHeight + 1;
|
||||||
|
}
|
||||||
|
for ( var item of textItems ) {
|
||||||
|
if (item.x == minX) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
@ -77,7 +77,7 @@ function bigDistance(lastItem, item, minX, mostUsedDistance) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
var allowedDisctance = mostUsedDistance + 1;
|
var allowedDisctance = mostUsedDistance + 1;
|
||||||
if (lastItem.x == item.x && item.x > minX) {
|
if (lastItem.x > minX && item.x > minX) {
|
||||||
//indented elements like lists often have greater spacing
|
//indented elements like lists often have greater spacing
|
||||||
allowedDisctance = mostUsedDistance + mostUsedDistance / 2;
|
allowedDisctance = mostUsedDistance + mostUsedDistance / 2;
|
||||||
}
|
}
|
||||||
|
@ -98,6 +98,8 @@ describe('isListItem', () => {
|
|||||||
|
|
||||||
it('Match', () => {
|
it('Match', () => {
|
||||||
expect(isListItem('- my text')).to.equal(true);
|
expect(isListItem('- my text')).to.equal(true);
|
||||||
|
expect(isListItem('- my text -')).to.equal(true);
|
||||||
|
expect(isListItem(' - my text')).to.equal(true);
|
||||||
expect(isListItem(' - my text')).to.equal(true);
|
expect(isListItem(' - my text')).to.equal(true);
|
||||||
expect(isListItem(' - my text')).to.equal(true);
|
expect(isListItem(' - my text')).to.equal(true);
|
||||||
|
|
||||||
@ -111,8 +113,6 @@ describe('isListItem', () => {
|
|||||||
expect(isListItem('-my text')).to.equal(false);
|
expect(isListItem('-my text')).to.equal(false);
|
||||||
expect(isListItem('•my text')).to.equal(false);
|
expect(isListItem('•my text')).to.equal(false);
|
||||||
expect(isListItem(' -my text')).to.equal(false);
|
expect(isListItem(' -my text')).to.equal(false);
|
||||||
expect(isListItem('- my text -')).to.equal(false);
|
|
||||||
expect(isListItem('• my text •')).to.equal(false);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user