From 5caf8154dbd162faf5d4e4259ae2e1c103dc1dce Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Tue, 14 Mar 2017 10:30:21 +0100 Subject: [PATCH] [WIP] Simplify code/quote detection --- src/javascript/functions.jsx | 2 +- src/javascript/models/AppState.jsx | 8 +- src/javascript/models/ElementType.jsx | 1 + .../transformations/DetectCodeBlocks.jsx | 84 ------------------- .../transformations/DetectCodeQuoteBlocks.jsx | 52 ++++++++++++ .../models/transformations/GatherBlocks.jsx | 2 +- test/functions.spec.js | 4 +- 7 files changed, 59 insertions(+), 94 deletions(-) delete mode 100644 src/javascript/models/transformations/DetectCodeBlocks.jsx create mode 100644 src/javascript/models/transformations/DetectCodeQuoteBlocks.jsx diff --git a/src/javascript/functions.jsx b/src/javascript/functions.jsx index 027037f..879a85e 100644 --- a/src/javascript/functions.jsx +++ b/src/javascript/functions.jsx @@ -56,7 +56,7 @@ export function removeLeadingWhitespaces(string) { } export function isListItem(string) { - return /^[\s]*[-•][\s].*[^-•]$/g.test(string); + return /^[\s]*[-•][\s].*$/g.test(string); } export function isNumberedListItem(string) { diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index edce49a..de09d62 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -8,16 +8,14 @@ import DetectTOC from './transformations/DetectTOC.jsx' import DetectListItems from './transformations/DetectListItems.jsx' import GatherBlocks from './transformations/GatherBlocks.jsx' +import DetectCodeQuoteBlocks from './transformations/DetectCodeQuoteBlocks.jsx' import DetectListLevels from './transformations/DetectListLevels.jsx' -import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' import DetectHeadlines from './transformations/DetectHeadlines.jsx' // import DetectFormats from './transformations/DetectFormats.jsx' -// import CombineSameY from './transformations/CombineSameY.jsx'; // import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx' // import DetectLinks from './transformations/DetectLinks.jsx' // import HeadlineDetector from './transformations/HeadlineDetector.jsx' // import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' -// import ToBlockSystem from './transformations/ToBlockSystem.jsx'; import ToTextBlocks from './transformations/ToTextBlocks.jsx'; import ToMarkdown from './transformations/ToMarkdown.jsx' @@ -38,17 +36,15 @@ export default class AppState { new DetectListItems(), new GatherBlocks(), + new DetectCodeQuoteBlocks(), new DetectListLevels(), - new DetectCodeBlocks(), new DetectHeadlines(), // new DetectFormats(), - // new CombineSameY(), // new RemoveWhitespaces(), // new DetectLinks(), // new HeadlineDetector(), // new HeadlineToUppercase(), - // new ToBlockSystem(), new ToTextBlocks(), new ToMarkdown()]; diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx index dece51e..abbc684 100644 --- a/src/javascript/models/ElementType.jsx +++ b/src/javascript/models/ElementType.jsx @@ -50,6 +50,7 @@ ElementType.initEnum({ } }, CODE: { + mergeToBlock: true, toText(block:TextItemBlock) { return '```\n' + concatTextItems(block.textItems) + '```' } diff --git a/src/javascript/models/transformations/DetectCodeBlocks.jsx b/src/javascript/models/transformations/DetectCodeBlocks.jsx deleted file mode 100644 index 74aa711..0000000 --- a/src/javascript/models/transformations/DetectCodeBlocks.jsx +++ /dev/null @@ -1,84 +0,0 @@ -import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; -import ParseResult from '../ParseResult.jsx'; -import TextItemBlock from '../TextItemBlock.jsx'; -import TextItemCombiner from '../TextItemCombiner.jsx'; -import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; -import ElementType from '../ElementType.jsx'; -import { minXFromBlocks } from '../../textItemFunctions.jsx'; - -//Detect quotes, code etc.. which is transformed to markdown code syntax -export default class DetectCodeBlocks extends ToTextItemBlockTransformation { - - constructor() { - super("Detect Code/Quotes"); - } - - transform(parseResult:ParseResult) { - const {mostUsedHeight, mostUsedDistance} = parseResult.globals; - - var foundBlocks = 0; - const textCombiner = new TextItemCombiner({ - mostUsedDistance: mostUsedDistance - }); - - parseResult.pages.forEach(page => { - var minX = minXFromBlocks(page.items); - if (minX) { - const itemAreSuitable = (items) => { - for ( let item of items ) { - if (item.x == minX) { - return false; - } - if (item.height > mostUsedHeight + 1) { - return false; - } - } - return true; - }; - const newBlocks = []; - var preceedingCodeBlock; - page.items.forEach(block => { - if (block.type) { - newBlocks.push(block); - preceedingCodeBlock = null; - } else { - if (itemAreSuitable(block.textItems)) { - const mergeWithPreceedingCodeBlock = preceedingCodeBlock && preceedingCodeBlock.textItems[preceedingCodeBlock.textItems.length - 1].y - block.textItems[0].y < mostUsedDistance * 2; - if (mergeWithPreceedingCodeBlock) { - newBlocks.pop(); - } - block.annotation = REMOVED_ANNOTATION; - newBlocks.push(block); - const combineResult = textCombiner.combine(block.textItems); - if (mergeWithPreceedingCodeBlock) { - preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems); - preceedingCodeBlock.parsedElements.add(combineResult.parsedElements); - } else { - preceedingCodeBlock = new TextItemBlock({ - type: ElementType.CODE, - annotation: ADDED_ANNOTATION, - textItems: combineResult.textItems, - parsedElements: combineResult.parsedElements - }); - foundBlocks++; - } - newBlocks.push(preceedingCodeBlock); - } else { - newBlocks.push(block); - preceedingCodeBlock = null; - } - } - }); - page.items = newBlocks; - } - }); - - return new ParseResult({ - ...parseResult, - messages: ['Detected ' + foundBlocks + ' code/quote blocks.'] - }); - - } - -} - diff --git a/src/javascript/models/transformations/DetectCodeQuoteBlocks.jsx b/src/javascript/models/transformations/DetectCodeQuoteBlocks.jsx new file mode 100644 index 0000000..89a2df4 --- /dev/null +++ b/src/javascript/models/transformations/DetectCodeQuoteBlocks.jsx @@ -0,0 +1,52 @@ +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; +import ParseResult from '../ParseResult.jsx'; +import { DETECTED_ANNOTATION } from '../Annotation.jsx'; +import ElementType from '../ElementType.jsx'; +import { minXFromBlocks } from '../../textItemFunctions.jsx'; + +//Detect items which are code/quote blocks +export default class DetectCodeQuoteBlocks extends ToTextItemBlockTransformation { + + constructor() { + super("Detect Code/Quote Blocks"); + } + + transform(parseResult:ParseResult) { + const {mostUsedHeight} = parseResult.globals; + var foundCodeItems = 0; + parseResult.pages.forEach(page => { + var minX = minXFromBlocks(page.items); + page.items.forEach(block => { + if (!block.type && looksLikeCodeBlock(minX, block.textItems, mostUsedHeight)) { + block.annotation = DETECTED_ANNOTATION; + block.type = ElementType.CODE; + foundCodeItems++; + } + }); + }); + + return new ParseResult({ + ...parseResult, + messages: [ + 'Detected ' + foundCodeItems + ' code/quote items.', + ] + }); + + } + +} + +function looksLikeCodeBlock(minX, textItems, mostUsedHeight) { + if (textItems.length == 0) { + return false; + } + if (textItems.length == 1) { + return textItems[0].x > minX && textItems[0].height <= mostUsedHeight + 1; + } + for ( var item of textItems ) { + if (item.x == minX) { + return false; + } + } + return true; +} diff --git a/src/javascript/models/transformations/GatherBlocks.jsx b/src/javascript/models/transformations/GatherBlocks.jsx index d7d1f3d..35c92d6 100644 --- a/src/javascript/models/transformations/GatherBlocks.jsx +++ b/src/javascript/models/transformations/GatherBlocks.jsx @@ -77,7 +77,7 @@ function bigDistance(lastItem, item, minX, mostUsedDistance) { return true; } var allowedDisctance = mostUsedDistance + 1; - if (lastItem.x == item.x && item.x > minX) { + if (lastItem.x > minX && item.x > minX) { //intended elements like lists often have greater spacing allowedDisctance = mostUsedDistance + mostUsedDistance / 2; } diff --git a/test/functions.spec.js b/test/functions.spec.js index 0390170..b01a85e 100644 --- a/test/functions.spec.js +++ b/test/functions.spec.js @@ -98,8 +98,10 @@ describe('isListItem', () => { it('Match', () => { expect(isListItem('- my text')).to.equal(true); + expect(isListItem('- my text -')).to.equal(true); expect(isListItem(' - my text')).to.equal(true); expect(isListItem(' - my text')).to.equal(true); + expect(isListItem(' - my text')).to.equal(true); expect(isListItem('• my text')).to.equal(true); expect(isListItem(' • my text')).to.equal(true); @@ -111,8 +113,6 @@ describe('isListItem', () => { expect(isListItem('-my text')).to.equal(false); expect(isListItem('•my text')).to.equal(false); expect(isListItem(' -my text')).to.equal(false); - expect(isListItem('- my text -')).to.equal(false); - expect(isListItem('• my text •')).to.equal(false); }); });