Improve list detection

* Add a space (' ') after the list character on compact lines when the line starts with a list character
* Add – as list character
* Rename functions.jsx to stringFunctions.jsx
This commit is contained in:
Johannes Zillmann 2017-03-28 09:00:21 +02:00
parent 106e2bfa8e
commit c4679238cd
8 changed files with 41 additions and 18 deletions

View File

@ -1,4 +1,4 @@
import { normalizedCharCodeArray } from '../functions.jsx' import { normalizedCharCodeArray } from '../stringFunctions.jsx'
export default class HeadlineFinder { export default class HeadlineFinder {

View File

@ -4,7 +4,7 @@ import WordType from './markdown/WordType.jsx';
import LineItem from './LineItem.jsx'; import LineItem from './LineItem.jsx';
import StashingStream from './StashingStream.jsx'; import StashingStream from './StashingStream.jsx';
import { ParsedElements } from './PageItem.jsx'; import { ParsedElements } from './PageItem.jsx';
import { isNumber } from '../functions.jsx' import { isNumber, isListItemCharacter } from '../stringFunctions.jsx'
import { sortByX } from '../pageItemFunctions.jsx' import { sortByX } from '../pageItemFunctions.jsx'
// Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like // Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like
@ -146,13 +146,20 @@ function combineText(textItems) {
var text = ''; var text = '';
var lastItem; var lastItem;
textItems.forEach(textItem => { textItems.forEach(textItem => {
if (lastItem && !text.endsWith(' ') && !textItem.text.startsWith(' ')) { var textToAdd = textItem.text;
if (!text.endsWith(' ') && !textToAdd.startsWith(' ')) {
if (lastItem) {
const xDistance = textItem.x - lastItem.x - lastItem.width; const xDistance = textItem.x - lastItem.x - lastItem.width;
if (xDistance > 5) { if (xDistance > 5) {
text += ' '; text += ' ';
} }
} else {
if (isListItemCharacter(textItem.text)) {
textToAdd += ' ';
} }
text += textItem.text; }
}
text += textToAdd;
lastItem = textItem; lastItem = textItem;
}); });
return text; return text;

View File

@ -3,9 +3,9 @@ import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx';
import { isListItem } from '../../../functions.jsx'; import { isListItem } from '../../../stringFunctions.jsx';
//Detect items starting with -, , etc... //Detect headlines based on heights
export default class DetectHeaders extends ToLineItemTransformation { export default class DetectHeaders extends ToLineItemTransformation {
constructor() { constructor() {

View File

@ -1,9 +1,10 @@
import ToLineItemTransformation from '../ToLineItemTransformation.jsx'; import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import LineItem from '../../LineItem.jsx'; import LineItem from '../../LineItem.jsx';
import Word from '../../Word.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx'; import { isListItemCharacter, isNumberedListItem } from '../../../stringFunctions.jsx';
//Detect items starting with -, •, etc... //Detect items starting with -, •, etc...
export default class DetectListItems extends ToLineItemTransformation { export default class DetectListItems extends ToLineItemTransformation {
@ -21,17 +22,20 @@ export default class DetectListItems extends ToLineItemTransformation {
newItems.push(item); newItems.push(item);
if (!item.type) { if (!item.type) {
var text = item.text(); var text = item.text();
if (isListItem(text)) { if (isListItemCharacter(item.words[0].string)) {
foundListItems++ foundListItems++
const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length); if (item.words[0].string === '-') {
if (textWithDash === text) {
item.annotation = DETECTED_ANNOTATION; item.annotation = DETECTED_ANNOTATION;
item.type = ElementType.LIST; item.type = ElementType.LIST;
} else { } else {
item.annotation = REMOVED_ANNOTATION; item.annotation = REMOVED_ANNOTATION;
const newWords = item.words.map(word => new Word({
...word
}));
newWords[0].string = '-';
newItems.push(new LineItem({ newItems.push(new LineItem({
...item, ...item,
text: textWithDash, words: newWords,
annotation: ADDED_ANNOTATION, annotation: ADDED_ANNOTATION,
type: ElementType.LIST type: ElementType.LIST
})); }));

View File

@ -6,7 +6,7 @@ import HeadlineFinder from '../../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx';
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../functions.jsx' import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx'
//Detect table of contents pages plus linked headlines //Detect table of contents pages plus linked headlines
export default class DetectTOC extends ToLineItemTransformation { export default class DetectTOC extends ToLineItemTransformation {

View File

@ -2,7 +2,7 @@ import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../../Annotation.jsx'; import { REMOVED_ANNOTATION } from '../../Annotation.jsx';
import { isDigit } from '../../../functions.jsx' import { isDigit } from '../../../stringFunctions.jsx'
function hashCodeIgnoringSpacesAndNumbers(string) { function hashCodeIgnoringSpacesAndNumbers(string) {

View File

@ -92,8 +92,17 @@ export function suffixBeforeWhitespace(string, suffix) {
} }
} }
/**
 * Tells whether the given string is exactly one list-marker character.
 *
 * Recognized markers are the hyphen-minus ('-'), the bullet ('•') and the
 * en dash ('–', U+2013).
 *
 * @param {string} string - The candidate text, e.g. the first word of a line.
 * @returns {boolean} true only if `string` is a single character and that
 *     character is one of the recognized list markers.
 */
export function isListItemCharacter(string) {
    // Require exactly one character: longer strings are ordinary words, and an
    // empty string must not match (charAt(0) on '' yields '', not a marker).
    if (string.length !== 1) {
        return false;
    }
    const char = string.charAt(0);
    return char === '-' || char === '•' || char === '–';
}
export function isListItem(string) { export function isListItem(string) {
return /^[\s]*[-•][\s].*$/g.test(string); return /^[\s]*[-•][\s].*$/g.test(string);
} }
export function isNumberedListItem(string) { export function isNumberedListItem(string) {

View File

@ -1,6 +1,6 @@
import { expect } from 'chai'; import { expect } from 'chai';
import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx' import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, removeTrailingWhitespaces, prefixAfterWhitespace, suffixBeforeWhitespace, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/stringFunctions.jsx'
describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => { describe('functions: hasUpperCaseCharacterInMiddleOfWord', () => {
@ -144,6 +144,9 @@ describe('functions: isListItem', () => {
expect(isListItem('• my text')).to.equal(true); expect(isListItem('• my text')).to.equal(true);
expect(isListItem(' • my text')).to.equal(true); expect(isListItem(' • my text')).to.equal(true);
expect(isListItem(' • my text')).to.equal(true); expect(isListItem(' • my text')).to.equal(true);
expect(isListItem(' my text')).to.equal(true);
expect(isListItem(' my text')).to.equal(true);
}); });
it('No Match', () => { it('No Match', () => {