[WIP] Simplify code/quote detection

This commit is contained in:
Johannes Zillmann 2017-03-14 10:30:21 +01:00
parent c6f592d3fc
commit 5caf8154db
7 changed files with 59 additions and 94 deletions

View File

@ -56,7 +56,7 @@ export function removeLeadingWhitespaces(string) {
}
export function isListItem(string) {
return /^[\s]*[-•][\s].*[^-•]$/g.test(string);
return /^[\s]*[-•][\s].*$/g.test(string);
}
export function isNumberedListItem(string) {

View File

@ -8,16 +8,14 @@ import DetectTOC from './transformations/DetectTOC.jsx'
import DetectListItems from './transformations/DetectListItems.jsx'
import GatherBlocks from './transformations/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/DetectCodeQuoteBlocks.jsx'
import DetectListLevels from './transformations/DetectListLevels.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx'
// import CombineSameY from './transformations/CombineSameY.jsx';
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
// import DetectLinks from './transformations/DetectLinks.jsx'
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
// import ToBlockSystem from './transformations/ToBlockSystem.jsx';
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -38,17 +36,15 @@ export default class AppState {
new DetectListItems(),
new GatherBlocks(),
new DetectCodeQuoteBlocks(),
new DetectListLevels(),
new DetectCodeBlocks(),
new DetectHeadlines(),
// new DetectFormats(),
// new CombineSameY(),
// new RemoveWhitespaces(),
// new DetectLinks(),
// new HeadlineDetector(),
// new HeadlineToUppercase(),
// new ToBlockSystem(),
new ToTextBlocks(),
new ToMarkdown()];

View File

@ -50,6 +50,7 @@ ElementType.initEnum({
}
},
CODE: {
mergeToBlock: true,
toText(block:TextItemBlock) {
return '```\n' + concatTextItems(block.textItems) + '```'
}

View File

@ -1,84 +0,0 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
// Detects untyped blocks that look like quotes or code and adds CODE-typed
// blocks (ElementType.CODE) for them, so they end up in markdown code syntax.
// The original block is kept in the page but annotated as removed; the
// combined replacement block is annotated as added.
export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
// Passes the label "Detect Code/Quotes" to the base class constructor.
constructor() {
super("Detect Code/Quotes");
}
/**
 * Rewrites the items of every page, inserting CODE blocks after the
 * blocks they were derived from.
 *
 * @param parseResult the parse result whose pages are processed; page.items is replaced in place
 * @returns a new ParseResult carrying the fields of the input plus a single summary message
 */
transform(parseResult:ParseResult) {
const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
// Counts newly created CODE blocks (blocks merged into a preceding one are not counted again).
var foundBlocks = 0;
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
parseResult.pages.forEach(page => {
var minX = minXFromBlocks(page.items);
// Pages for which minXFromBlocks yields a falsy value are left untouched.
// NOTE(review): a minX of exactly 0 is falsy as well and would skip the page - confirm this is intended.
if (minX) {
// A block qualifies only if none of its items sits at the page's minimal x
// and none is taller than mostUsedHeight + 1.
const itemAreSuitable = (items) => {
for ( let item of items ) {
if (item.x == minX) {
return false;
}
if (item.height > mostUsedHeight + 1) {
return false;
}
}
return true;
};
const newBlocks = [];
// The CODE block created/extended by the directly preceding block, or null/undefined if there is none.
var preceedingCodeBlock;
page.items.forEach(block => {
if (block.type) {
// Already typed blocks are passed through and interrupt any running code block.
newBlocks.push(block);
preceedingCodeBlock = null;
} else {
if (itemAreSuitable(block.textItems)) {
// Merge when the y of the preceding code block's last item and the y of this
// block's first item differ by less than twice the most used distance.
const mergeWithPreceedingCodeBlock = preceedingCodeBlock && preceedingCodeBlock.textItems[preceedingCodeBlock.textItems.length - 1].y - block.textItems[0].y < mostUsedDistance * 2;
if (mergeWithPreceedingCodeBlock) {
// The preceding code block is the last pushed element; take it out so it can be
// re-appended below, after the current (removed) block.
newBlocks.pop();
}
block.annotation = REMOVED_ANNOTATION;
newBlocks.push(block);
const combineResult = textCombiner.combine(block.textItems);
if (mergeWithPreceedingCodeBlock) {
// Extend the existing code block with the combined items of this block.
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
} else {
preceedingCodeBlock = new TextItemBlock({
type: ElementType.CODE,
annotation: ADDED_ANNOTATION,
textItems: combineResult.textItems,
parsedElements: combineResult.parsedElements
});
foundBlocks++;
}
newBlocks.push(preceedingCodeBlock);
} else {
// Unsuitable untyped blocks are passed through and interrupt any running code block.
newBlocks.push(block);
preceedingCodeBlock = null;
}
}
});
page.items = newBlocks;
}
});
return new ParseResult({
...parseResult,
messages: ['Detected ' + foundBlocks + ' code/quote blocks.']
});
}
}

View File

@ -0,0 +1,52 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
// Marks every still-untyped block that looks like a code/quote block as ElementType.CODE.
export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation {

    constructor() {
        super("Detect Code/Quote Blocks");
    }

    /**
     * Walks all blocks of all pages and tags the untyped ones accepted by
     * looksLikeCodeBlock() with the CODE type and the DETECTED annotation.
     * Blocks are modified in place.
     *
     * @param parseResult the parse result whose page blocks are inspected
     * @returns a new ParseResult with the fields of the input and one summary message
     */
    transform(parseResult:ParseResult) {
        const heightBaseline = parseResult.globals.mostUsedHeight;
        let detectedCount = 0;

        for (const page of parseResult.pages) {
            // Smallest x over the page's blocks, as computed by minXFromBlocks.
            const leftEdge = minXFromBlocks(page.items);
            for (const candidate of page.items) {
                // Blocks that already carry a type are never re-classified.
                if (candidate.type) {
                    continue;
                }
                if (!looksLikeCodeBlock(leftEdge, candidate.textItems, heightBaseline)) {
                    continue;
                }
                candidate.annotation = DETECTED_ANNOTATION;
                candidate.type = ElementType.CODE;
                detectedCount++;
            }
        }

        const summary = 'Detected ' + detectedCount + ' code/quote items.';
        return new ParseResult({
            ...parseResult,
            messages: [summary]
        });
    }
}
/**
 * Heuristic deciding whether the text items of a block form a code/quote block.
 *
 * - no items: never a code block
 * - exactly one item: it must lie right of minX and must not be taller than
 *   mostUsedHeight + 1
 * - several items: none of them may sit exactly at minX (height is not checked)
 *
 * @param minX the minimal x coordinate to compare the items against
 * @param textItems the items of the block, each providing `x` and `height`
 * @param mostUsedHeight the reference height used for the single-item case
 * @returns true if the items look like a code/quote block
 */
function looksLikeCodeBlock(minX, textItems, mostUsedHeight) {
    switch (textItems.length) {
        case 0:
            return false;
        case 1: {
            const only = textItems[0];
            return only.x > minX && only.height <= mostUsedHeight + 1;
        }
        default: {
            const touchesLeftEdge = textItems.some(entry => entry.x == minX);
            return !touchesLeftEdge;
        }
    }
}

View File

@ -77,7 +77,7 @@ function bigDistance(lastItem, item, minX, mostUsedDistance) {
return true;
}
var allowedDisctance = mostUsedDistance + 1;
if (lastItem.x == item.x && item.x > minX) {
if (lastItem.x > minX && item.x > minX) {
//indented elements like lists often have greater spacing
allowedDisctance = mostUsedDistance + mostUsedDistance / 2;
}

View File

@ -97,6 +97,8 @@ describe('normalizedCharCodeArray', () => {
describe('isListItem', () => {
it('Match', () => {
expect(isListItem('- my text')).to.equal(true);
expect(isListItem('- my text -')).to.equal(true);
expect(isListItem(' - my text')).to.equal(true);
expect(isListItem(' - my text')).to.equal(true);
expect(isListItem(' - my text')).to.equal(true);
@ -111,8 +113,6 @@ describe('isListItem', () => {
expect(isListItem('-my text')).to.equal(false);
expect(isListItem('•my text')).to.equal(false);
expect(isListItem(' -my text')).to.equal(false);
expect(isListItem('- my text -')).to.equal(false);
expect(isListItem('• my text •')).to.equal(false);
});
});