mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-16 18:48:55 +01:00
[WIP] Simplify code/quote detection
This commit is contained in:
parent
c6f592d3fc
commit
5caf8154db
@ -56,7 +56,7 @@ export function removeLeadingWhitespaces(string) {
|
||||
}
|
||||
|
||||
// Tells whether the given string looks like a bullet list item:
// optional leading whitespace, then a '-' or '•' bullet, then one whitespace
// character, then the rest of the text (which must not contain a line break).
//
// The text is allowed to END with a bullet character as well, e.g.
// '- my text -' is a list item. The previous pattern rejected such strings
// and, being the first return statement, made this pattern unreachable.
//
// Returns true when the string matches, false otherwise.
export function isListItem(string) {
    // No 'g' flag: a single yes/no test does not need a global, stateful regex.
    return /^[\s]*[-•][\s].*$/.test(string);
}
|
||||
|
||||
export function isNumberedListItem(string) {
|
||||
|
@ -8,16 +8,14 @@ import DetectTOC from './transformations/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/DetectListItems.jsx'
|
||||
|
||||
import GatherBlocks from './transformations/GatherBlocks.jsx'
|
||||
import DetectCodeQuoteBlocks from './transformations/DetectCodeQuoteBlocks.jsx'
|
||||
import DetectListLevels from './transformations/DetectListLevels.jsx'
|
||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
||||
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
// import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||
// import DetectLinks from './transformations/DetectLinks.jsx'
|
||||
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
// import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||
|
||||
@ -38,17 +36,15 @@ export default class AppState {
|
||||
new DetectListItems(),
|
||||
|
||||
new GatherBlocks(),
|
||||
new DetectCodeQuoteBlocks(),
|
||||
new DetectListLevels(),
|
||||
new DetectCodeBlocks(),
|
||||
new DetectHeadlines(),
|
||||
|
||||
// new DetectFormats(),
|
||||
// new CombineSameY(),
|
||||
// new RemoveWhitespaces(),
|
||||
// new DetectLinks(),
|
||||
// new HeadlineDetector(),
|
||||
// new HeadlineToUppercase(),
|
||||
// new ToBlockSystem(),
|
||||
new ToTextBlocks(),
|
||||
new ToMarkdown()];
|
||||
|
||||
|
@ -50,6 +50,7 @@ ElementType.initEnum({
|
||||
}
|
||||
},
|
||||
CODE: {
|
||||
mergeToBlock: true,
|
||||
toText(block:TextItemBlock) {
|
||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
||||
}
|
||||
|
@ -1,84 +0,0 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
||||
export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Code/Quotes");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
|
||||
|
||||
var foundBlocks = 0;
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
parseResult.pages.forEach(page => {
|
||||
var minX = minXFromBlocks(page.items);
|
||||
if (minX) {
|
||||
const itemAreSuitable = (items) => {
|
||||
for ( let item of items ) {
|
||||
if (item.x == minX) {
|
||||
return false;
|
||||
}
|
||||
if (item.height > mostUsedHeight + 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
const newBlocks = [];
|
||||
var preceedingCodeBlock;
|
||||
page.items.forEach(block => {
|
||||
if (block.type) {
|
||||
newBlocks.push(block);
|
||||
preceedingCodeBlock = null;
|
||||
} else {
|
||||
if (itemAreSuitable(block.textItems)) {
|
||||
const mergeWithPreceedingCodeBlock = preceedingCodeBlock && preceedingCodeBlock.textItems[preceedingCodeBlock.textItems.length - 1].y - block.textItems[0].y < mostUsedDistance * 2;
|
||||
if (mergeWithPreceedingCodeBlock) {
|
||||
newBlocks.pop();
|
||||
}
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
newBlocks.push(block);
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (mergeWithPreceedingCodeBlock) {
|
||||
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
|
||||
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
|
||||
} else {
|
||||
preceedingCodeBlock = new TextItemBlock({
|
||||
type: ElementType.CODE,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
textItems: combineResult.textItems,
|
||||
parsedElements: combineResult.parsedElements
|
||||
});
|
||||
foundBlocks++;
|
||||
}
|
||||
newBlocks.push(preceedingCodeBlock);
|
||||
} else {
|
||||
newBlocks.push(block);
|
||||
preceedingCodeBlock = null;
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newBlocks;
|
||||
}
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: ['Detected ' + foundBlocks + ' code/quote blocks.']
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,52 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../Annotation.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detect items which are code/quote blocks
|
||||
export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Code/Quote Blocks");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedHeight} = parseResult.globals;
|
||||
var foundCodeItems = 0;
|
||||
parseResult.pages.forEach(page => {
|
||||
var minX = minXFromBlocks(page.items);
|
||||
page.items.forEach(block => {
|
||||
if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) {
|
||||
block.annotation = DETECTED_ANNOTATION;
|
||||
block.type = ElementType.CODE;
|
||||
foundCodeItems++;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Detected ' + foundCodeItems + ' code/quote items.',
|
||||
]
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Heuristic for code/quote blocks, based on the items' x position relative to minX:
//  - no items: never a code block
//  - exactly one item: it must lie right of minX and must not be taller
//    than mostUsedHeight + 1
//  - several items: none of them may sit exactly at minX (height is not checked here)
function looksLikeCodeBlock(minX, textItems, mostUsedHeight) {
    switch (textItems.length) {
        case 0:
            return false;
        case 1: {
            const single = textItems[0];
            return single.x > minX && single.height <= mostUsedHeight + 1;
        }
        default:
            return !textItems.some(entry => entry.x == minX);
    }
}
|
@ -77,7 +77,7 @@ function bigDistance(lastItem, item, minX, mostUsedDistance) {
|
||||
return true;
|
||||
}
|
||||
var allowedDisctance = mostUsedDistance + 1;
|
||||
if (lastItem.x == item.x && item.x > minX) {
|
||||
if (lastItem.x > minX && item.x > minX) {
|
||||
//indented elements like lists often have greater spacing
|
||||
allowedDisctance = mostUsedDistance + mostUsedDistance / 2;
|
||||
}
|
||||
|
@ -98,8 +98,10 @@ describe('isListItem', () => {
|
||||
|
||||
it('Match', () => {
|
||||
expect(isListItem('- my text')).to.equal(true);
|
||||
expect(isListItem('- my text -')).to.equal(true);
|
||||
expect(isListItem(' - my text')).to.equal(true);
|
||||
expect(isListItem(' - my text')).to.equal(true);
|
||||
expect(isListItem(' - my text')).to.equal(true);
|
||||
|
||||
expect(isListItem('• my text')).to.equal(true);
|
||||
expect(isListItem(' • my text')).to.equal(true);
|
||||
@ -111,8 +113,6 @@ describe('isListItem', () => {
|
||||
expect(isListItem('-my text')).to.equal(false);
|
||||
expect(isListItem('•my text')).to.equal(false);
|
||||
expect(isListItem(' -my text')).to.equal(false);
|
||||
expect(isListItem('- my text -')).to.equal(false);
|
||||
expect(isListItem('• my text •')).to.equal(false);
|
||||
});
|
||||
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user