From c4679238cdadc65893f99c8840a5a6e7b09e4898 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Tue, 28 Mar 2017 09:00:21 +0200 Subject: [PATCH] Improve list detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add ‘ ‘ on compact lines when line starts with list character * Add – as list character * rename functions.jsx to stringFunctions.jsx --- src/javascript/models/HeadlineFinder.jsx | 2 +- src/javascript/models/LineConverter.jsx | 19 +++++++++++++------ .../textitem/DetectHeaders.jsx | 4 ++-- .../textitem/DetectListItems.jsx | 14 +++++++++----- .../transformations/textitem/DetectTOC.jsx | 2 +- .../textitem/RemoveRepetitiveElements.jsx | 2 +- .../{functions.jsx => stringFunctions.jsx} | 11 ++++++++++- ...ctions.spec.js => stringFunctions.spec.js} | 5 ++++- 8 files changed, 41 insertions(+), 18 deletions(-) rename src/javascript/{functions.jsx => stringFunctions.jsx} (92%) rename test/{functions.spec.js => stringFunctions.spec.js} (97%) diff --git a/src/javascript/models/HeadlineFinder.jsx b/src/javascript/models/HeadlineFinder.jsx index 4061400..5b9d8bd 100644 --- a/src/javascript/models/HeadlineFinder.jsx +++ b/src/javascript/models/HeadlineFinder.jsx @@ -1,4 +1,4 @@ -import { normalizedCharCodeArray } from '../functions.jsx' +import { normalizedCharCodeArray } from '../stringFunctions.jsx' export default class HeadlineFinder { diff --git a/src/javascript/models/LineConverter.jsx b/src/javascript/models/LineConverter.jsx index 9729cd2..bb7160a 100644 --- a/src/javascript/models/LineConverter.jsx +++ b/src/javascript/models/LineConverter.jsx @@ -4,7 +4,7 @@ import WordType from './markdown/WordType.jsx'; import LineItem from './LineItem.jsx'; import StashingStream from './StashingStream.jsx'; import { ParsedElements } from './PageItem.jsx'; -import { isNumber } from '../functions.jsx' +import { isNumber, isListItemCharacter } from '../stringFunctions.jsx' import { sortByX } from '../pageItemFunctions.jsx' // Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like @@ -146,13 +146,20 @@ function combineText(textItems) { var text = ''; var lastItem; textItems.forEach(textItem => { - if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) { - const xDistance = textItem.x - lastItem.x - lastItem.width; - if (xDistance > 5) { - text += ' '; + var textToAdd = textItem.text; + if (!text.endsWith(' ') && !textToAdd.startsWith(' ')) { + if (lastItem) { + const xDistance = textItem.x - lastItem.x - lastItem.width; + if (xDistance > 5) { + text += ' '; + } + } else { + if (isListItemCharacter(textItem.text)) { + textToAdd += ' '; + } } } - text += textItem.text; + text += textToAdd; lastItem = textItem; }); return text; diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx index 93d36ad..11a0631 100644 --- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx +++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx @@ -3,9 +3,9 @@ import ParseResult from '../../ParseResult.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx'; -import { isListItem } from '../../../functions.jsx'; +import { isListItem } from '../../../stringFunctions.jsx'; -//Detect items starting with -, •, etc... +//Detect headlines based on heights export default class DetectHeaders extends ToLineItemTransformation { constructor() { diff --git a/src/javascript/models/transformations/textitem/DetectListItems.jsx b/src/javascript/models/transformations/textitem/DetectListItems.jsx index 8bdd197..8ff0856 100644 --- a/src/javascript/models/transformations/textitem/DetectListItems.jsx +++ b/src/javascript/models/transformations/textitem/DetectListItems.jsx @@ -1,9 +1,10 @@ import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; import LineItem from '../../LineItem.jsx'; +import Word from '../../Word.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; -import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx'; +import { isListItemCharacter, isNumberedListItem } from '../../../stringFunctions.jsx'; //Detect items starting with -, •, etc... export default class DetectListItems extends ToLineItemTransformation { @@ -21,17 +22,20 @@ export default class DetectListItems extends ToLineItemTransformation { newItems.push(item); if (!item.type) { var text = item.text(); - if (isListItem(text)) { + if (isListItemCharacter(item.words[0].string)) { foundListItems++ - const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length); - if (textWithDash === text) { + if (item.words[0].string === '-') { item.annotation = DETECTED_ANNOTATION; item.type = ElementType.LIST; } else { item.annotation = REMOVED_ANNOTATION; + const newWords = item.words.map(word => new Word({ + ...word + })); + newWords[0].string = '-'; newItems.push(new LineItem({ ...item, - text: textWithDash, + words: newWords, annotation: ADDED_ANNOTATION, type: ElementType.LIST })); diff --git a/src/javascript/models/transformations/textitem/DetectTOC.jsx b/src/javascript/models/transformations/textitem/DetectTOC.jsx index 93ca758..f81e5d8 100644 --- a/src/javascript/models/transformations/textitem/DetectTOC.jsx +++ b/src/javascript/models/transformations/textitem/DetectTOC.jsx @@ -6,7 +6,7 @@ import HeadlineFinder from '../../HeadlineFinder.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx'; -import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx' +import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx' //Detect table of contents pages plus linked headlines export default class DetectTOC extends ToLineItemTransformation { diff --git a/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx index d84f82e..85c2b92 100644 --- a/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx +++ b/src/javascript/models/transformations/textitem/RemoveRepetitiveElements.jsx @@ -2,7 +2,7 @@ import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; import { REMOVED_ANNOTATION } from '../../Annotation.jsx'; -import { isDigit } from '../../../functions.jsx' +import { isDigit } from '../../../stringFunctions.jsx' function hashCodeIgnoringSpacesAndNumbers(string) { diff --git a/src/javascript/functions.jsx b/src/javascript/stringFunctions.jsx similarity index 92% rename from src/javascript/functions.jsx rename to src/javascript/stringFunctions.jsx index 1680a69..7833348 100644 --- a/src/javascript/functions.jsx +++ b/src/javascript/stringFunctions.jsx @@ -92,8 +92,17 @@ export function suffixBeforeWhitespace(string, suffix) { } } +export function isListItemCharacter(string) { + if (string.length > 1) { + return false + } + const char = string.charAt(0); + return char === '-' || char === '•' || char === '–'; +} + + export function isListItem(string) { - return /^[\s]*[-•][\s].*$/g.test(string); + return /^[\s]*[-•–][\s].*$/g.test(string); } export function isNumberedListItem(string) { diff --git a/test/functions.spec.js b/test/stringFunctions.spec.js similarity index 97% rename from test/functions.spec.js rename to test/stringFunctions.spec.js index 8aa553e..cb82fd5 100644 --- a/test/functions.spec.js +++ b/test/stringFunctions.spec.js @@ -1,6 +1,6 @@ import { expect } from 'chai'; -import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx' +import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/stringFunctions.jsx' describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => { @@ -144,6 +144,9 @@ describe('functions: isListItem', () => { expect(isListItem('• my text')).to.equal(true); expect(isListItem(' • my text')).to.equal(true); expect(isListItem(' • my text')).to.equal(true); + + expect(isListItem('– my text')).to.equal(true); + expect(isListItem(' – my text')).to.equal(true); }); it('No Match', () => {