Mirror of https://github.com/jzillmann/pdf-to-markdown.git (synced 2024-11-22 07:43:46 +01:00)

Commit 6f69566e98: [WIP] TOC headline parsing
Parent commit: c9352d8396
src/javascript/functions.jsx:

@@ -1,5 +1,11 @@
+const MIN_DIGIT_CHAR_CODE = 48;
+const MAX_DIGIT_CHAR_CODE = 57;
+const WHITESPACE_CHAR_CODE = 32;
+const TAB_CHAR_CODE = 9;
+const DOT_CHAR_CODE = 46;
+
 export function isDigit(charCode) {
-    return charCode >= 48 && charCode <= 57;
+    return charCode >= MIN_DIGIT_CHAR_CODE && charCode <= MAX_DIGIT_CHAR_CODE;
 }
 
 export function isNumber(string) {
@@ -27,3 +33,17 @@ export function hasUpperCaseCharacterInMiddleOfWord(text) {
     }
     return false;
 }
+
+// Remove whitespace/dots + to uppercase
+export function normalizedCharCodeArray(string) {
+    string = string.toUpperCase();
+    return charCodeArray(string).filter(charCode => charCode != WHITESPACE_CHAR_CODE && charCode != TAB_CHAR_CODE && charCode != DOT_CHAR_CODE);
+}
+
+export function charCodeArray(string) {
+    const charCodes = [];
+    for (var i = 0; i < string.length; i++) {
+        charCodes.push(string.charCodeAt(i));
+    }
+    return charCodes;
+}
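For orientation (not part of the commit): the two new helpers reduce a string to comparable character codes. charCodeArray returns the raw char codes, while normalizedCharCodeArray additionally uppercases the string and drops spaces, tabs and dots, which is what later lets a TOC entry match a headline regardless of case and spacing. A minimal usage sketch, assuming the import path used by the specs below:

import { charCodeArray, normalizedCharCodeArray } from '../src/javascript/functions.jsx';

charCodeArray('Ab.');                                                      // [65, 98, 46]
String.fromCharCode(...normalizedCharCodeArray('Some little sentence.'));  // 'SOMELITTLESENTENCE'
String.fromCharCode(...normalizedCharCodeArray(' Headline'));              // 'HEADLINE'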
src/javascript/models/AppState.jsx:

@@ -8,6 +8,7 @@ import DetectFootnotes from './transformations/DetectFootnotes.jsx'
 import DetectTOC from './transformations/DetectTOC.jsx'
 import DetectLists from './transformations/DetectLists.jsx'
 import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
+import DetectHeadlines from './transformations/DetectHeadlines.jsx'
 // import DetectFormats from './transformations/DetectFormats.jsx'
 // import CombineSameY from './transformations/CombineSameY.jsx';
 // import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
@@ -35,6 +36,7 @@ export default class AppState {
             new DetectTOC(),
             new DetectLists(),
             new DetectCodeBlocks(),
+            new DetectHeadlines(),
 
             // new DetectFormats(),
             // new CombineSameY(),
src/javascript/models/HeadlineFinder.jsx (new file, 40 lines):

@@ -0,0 +1,40 @@
+import { normalizedCharCodeArray } from '../functions.jsx'
+
+export default class HeadlineFinder {
+
+    constructor(options) {
+        this.headlineCharCodes = normalizedCharCodeArray(options.headline);
+        this.stackedTextItems = [];
+        this.stackedChars = 0;
+    }
+
+    consume(textItem) {
+        const normalizedCharCodes = normalizedCharCodeArray(textItem.text);
+        const matchAll = this.matchAll(normalizedCharCodes);
+        if (matchAll) {
+            this.stackedTextItems.push(textItem);
+            this.stackedChars += normalizedCharCodes.length;
+            if (this.stackedChars == this.headlineCharCodes.length) {
+                return this.stackedTextItems;
+            }
+        } else {
+            if (this.stackedChars > 0) {
+                this.stackedChars = 0;
+                this.stackedTextItems = [];
+                this.consume(textItem); // test again without stack
+            }
+        }
+        return null;
+    }
+
+    matchAll(normalizedCharCodes) {
+        for (var i = 0; i < normalizedCharCodes.length; i++) {
+            const headlineChar = this.headlineCharCodes[this.stackedChars + i];
+            const textItemChar = normalizedCharCodes[i];
+            if (textItemChar != headlineChar) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+}
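A quick sketch of how the new HeadlineFinder is driven (illustrative only; the headline and text items are made up, import paths follow the spec added below). consume() stacks text items whose normalized characters continue the expected headline and returns the full stack once the whole headline has been matched, otherwise null; a non-matching item clears the stack and is retried on its own.

import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
import TextItem from '../src/javascript/models/TextItem.jsx';

// Hypothetical case: a headline split across two PDF text items.
const finder = new HeadlineFinder({ headline: 'My Little Headline' });
const part1 = new TextItem({ text: 'My Little' });
const part2 = new TextItem({ text: ' Headline' });

finder.consume(part1); // null - only 'MYLITTLE' matched so far
finder.consume(part2); // [part1, part2] - headline complete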
src/javascript/models/transformations/DetectLists.jsx:

@@ -35,9 +35,9 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
 
         var lastItemX;
         var currentLevel = 0;
+        var xByLevel = {};
         var itemsBeforeFirstLineItem = [];
         var listBlockItems = [];
-        var xByLevel = {};
 
         const pushLineItem = (originalItem, text, setLevel) => {
             if (lastItemX && setLevel) {
src/javascript/models/transformations/DetectTOC.jsx:

@@ -3,8 +3,9 @@ import ParseResult from '../ParseResult.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfBlock from '../PdfBlock.jsx';
 import TextItemCombiner from '../TextItemCombiner.jsx';
+import HeadlineFinder from '../HeadlineFinder.jsx';
 import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
-import { TOC_BLOCK, HEADLINE2 } from '../MarkdownElements.jsx';
+import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
 import { isDigit } from '../../functions.jsx'
 
 //Detect table of contents pages
@@ -22,14 +23,16 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
             mostUsedDistance: mostUsedDistance
         });
 
-        var lastLevel = 0;
-        const itemLeveler = new ItemLeveler();
+        const linkLeveler = new LinkLeveler();
+        var tocLinks = [];
+        var lastTocPage;
         parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
             var linesCount = 0;
             var linesWithDigitsCount = 0;
             var lineItemsWithDigits = [];
             const unknownBlocks = new Set();
             var headlineBlock;
+            const pageTocLinks = [];
             page.blocks.forEach(block => {
                 var blockHasLinesWithDigits = false;
                 const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
@@ -38,8 +41,10 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                     linesCount++
                     var lineText = lineItem.text.replace(/\./g, '').trim();
                     var endsWithDigit = false;
+                    var digits = [];
                     while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
-                        lineText = lineText.substring(0, lineText.length - 2);
+                        digits.unshift(lineText.charAt(lineText.length - 1));
+                        lineText = lineText.substring(0, lineText.length - 1);
                         endsWithDigit = true;
                     }
                     lineText = lineText.trim();
@@ -50,6 +55,13 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                        }
                        linesWithDigitsCount++;
                        blockHasLinesWithDigits = true;
+                       pageTocLinks.push(new TocLink({
+                           pageNumber: parseInt(digits.join('')),
+                           textItem: new TextItem({
+                               ...lineItem,
+                               text: lineText
+                           })
+                       }));
                        lineItemsWithDigits.push(new TextItem({
                            ...lineItem,
                            text: lineText
@@ -67,8 +79,13 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                 }
             });
 
+            // page has been processed
             if (linesWithDigitsCount * 100 / linesCount > 75) {
                 tocPages.push(page.index + 1);
+                lastTocPage = page;
+                linkLeveler.levelPageItems(pageTocLinks);
+                tocLinks = tocLinks.concat(pageTocLinks);
+
                 const newBlocks = [];
                 page.blocks.forEach((block) => {
                     if (!unknownBlocks.has(block)) {
@@ -83,17 +100,50 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                         }));
                     }
                 });
-                // lastLevel = processLevels(lineItemsWithDigits, lastLevel);
-                itemLeveler.level(lineItemsWithDigits);
-                newBlocks.push(new PdfBlock({
-                    textItems: lineItemsWithDigits,
-                    type: TOC_BLOCK,
-                    annotation: ADDED_ANNOTATION
-                }));
                 page.blocks = newBlocks;
             }
         });
 
+        //all pages have been processed
+        var foundHeadlines = tocLinks.length;
+        const notFoundHeadlines = [];
+        if (tocPages.length > 0) {
+            tocLinks.forEach(tocLink => {
+                var linkedPage = parseResult.content[tocLink.pageNumber - 1];
+                var foundHeadline = false;
+                if (linkedPage) {
+                    foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                    if (!foundHeadline) { // pages are off by 1 ?
+                        linkedPage = parseResult.content[tocLink.pageNumber];
+                        if (linkedPage) {
+                            foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                        }
+                    }
+                } else {
+                    //TODO sometimes pages are off. We could try the page range from pre to next ...
+                }
+                if (!foundHeadline) {
+                    notFoundHeadlines.push(tocLink);
+                }
+            });
+            lastTocPage.blocks.push(new PdfBlock({
+                textItems: tocLinks.map(tocLink => {
+                    tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
+                    return tocLink.textItem
+                }),
+                type: TOC_BLOCK,
+                annotation: ADDED_ANNOTATION
+            }));
+        }
+
+        const messages = [];
+        messages.push('Detected ' + tocPages.length + ' table of content pages');
+        if (foundHeadlines > 0) {
+            messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
+        }
+        if (notFoundHeadlines.length > 0) {
+            messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
+        }
         return new ParseResult({
             ...parseResult,
             globals: {
@@ -101,27 +151,61 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                 tocPages: tocPages
 
             },
-            messages: ['Detected ' + tocPages.length + ' table of content pages']
+            messages: messages
         });
     }
 
 }
 
+function findHeadline(page, tocLink, textCombiner) {
+    const headline = tocLink.textItem.text;
+    const headlineFinder = new HeadlineFinder({
+        headline: headline
+    });
+    var blockIndex = 0;
+    var lastBlock;
+    for ( var block of page.blocks ) {
+        const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
+        for ( var item of itemsGroupedByY ) {
+            const headlineItems = headlineFinder.consume(item);
+            if (headlineItems) {
+                const usedItems = headlineFinder.stackedTextItems;
+                block.annotation = REMOVED_ANNOTATION;
+                if (usedItems.length > itemsGroupedByY.length) {
+                    // 2 line headline
+                    lastBlock.annotation = REMOVED_ANNOTATION;
+                }
+                page.blocks.splice(blockIndex + 1, 0, new PdfBlock({
+                    textItems: [new TextItem({
+                        ...usedItems[0],
+                        text: headline
+                    })],
+                    type: headlineByLevel(tocLink.level + 2),
+                    annotation: ADDED_ANNOTATION
+                }));
+                return true;
+            }
+        }
+        blockIndex++;
+        lastBlock = block;
+    }
+    return false;
+}
+
-class ItemLeveler {
+class LinkLeveler {
     constructor() {
         this.levelByMethod = null;
         this.uniqueFonts = [];
-        this.headlines = [];
     }
 
-    level(lineItemsWithDigits) {
+    levelPageItems(tocLinks:TocLink[]) {
         if (!this.levelByMethod) {
-            const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
+            const uniqueX = this.calculateUniqueX(tocLinks);
             if (uniqueX.length > 1) {
                 this.levelByMethod = this.levelByXDiff;
             } else {
-                const uniqueFonts = this.calculateUniqueFonts(lineItemsWithDigits);
+                const uniqueFonts = this.calculateUniqueFonts(tocLinks);
                 if (uniqueFonts.length > 1) {
                     this.uniqueFonts = uniqueFonts;
                     this.levelByMethod = this.levelByFont;
@@ -130,46 +214,31 @@ class ItemLeveler {
                 }
             }
         }
-        this.levelByMethod(lineItemsWithDigits);
+        this.levelByMethod(tocLinks);
     }
 
-    levelByXDiff(lineItemsWithDigits) {
-        const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
-        lineItemsWithDigits.forEach(item => {
-            const level = uniqueX.indexOf(item.x);
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelByXDiff(tocLinks) {
+        const uniqueX = this.calculateUniqueX(tocLinks);
+        tocLinks.forEach(link => {
+            link.level = uniqueX.indexOf(link.textItem.x);
         });
     }
 
-    levelByFont(lineItemsWithDigits) {
-        lineItemsWithDigits.forEach(item => {
-            const level = this.uniqueFonts.indexOf(item.font);
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelByFont(tocLinks) {
+        tocLinks.forEach(link => {
+            link.level = this.uniqueFonts.indexOf(link.textItem.font);
        });
    }
 
-    levelToZero(lineItemsWithDigits) {
-        lineItemsWithDigits.forEach(item => {
-            const level = 0;
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelToZero(tocLinks) {
+        tocLinks.forEach(link => {
+            link.level = 0;
        });
    }
 
-    calculateUniqueX(lineItemsWithDigits) {
-        var uniqueX = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
-            if (uniquesArray.indexOf(lineItem.x) < 0) uniquesArray.push(lineItem.x);
+    calculateUniqueX(tocLinks) {
+        var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
+            if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
             return uniquesArray;
         }, []);
 
@@ -180,9 +249,9 @@ class ItemLeveler {
         return uniqueX;
     }
 
-    calculateUniqueFonts(lineItemsWithDigits) {
-        var uniqueFont = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
-            if (uniquesArray.indexOf(lineItem.font) < 0) uniquesArray.push(lineItem.font);
+    calculateUniqueFonts(tocLinks) {
+        var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
+            if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
             return uniquesArray;
         }, []);
 
@@ -191,9 +260,10 @@ class ItemLeveler {
 
 }
 
-class Headline {
+class TocLink {
     constructor(options) {
-        this.level = options.level;
-        this.text = options.text;
+        this.textItem = options.textItem;
+        this.pageNumber = options.pageNumber;
+        this.level = 0;
     }
 }
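Taken together, the reworked DetectTOC now (1) strips the trailing page digits off each TOC line and records them as TocLink objects, (2) levels the links per page with LinkLeveler, by distinct x offsets when the TOC is indented, otherwise by distinct fonts, otherwise level 0, and (3) uses HeadlineFinder via findHeadline() to locate each linked headline on its target page (retrying the next page for off-by-one page numbers), inserting a headlineByLevel(level + 2) block where it matches. A rough, simplified sketch of the x-based leveling on made-up data (object shapes follow the diff, the numbers are hypothetical):

// Hypothetical TOC links; textItem.x is the line's horizontal offset on the TOC page.
const tocLinks = [
    { textItem: { x: 20, text: 'Introduction' }, pageNumber: 1, level: 0 },
    { textItem: { x: 40, text: 'Background' },   pageNumber: 2, level: 0 },
    { textItem: { x: 20, text: 'Results' },      pageNumber: 5, level: 0 },
];

// levelByXDiff, roughly: a link's level is the position of its x among the distinct x values.
const uniqueX = tocLinks
    .map(link => link.textItem.x)
    .filter((x, i, all) => all.indexOf(x) === i);                 // [20, 40]
tocLinks.forEach(link => { link.level = uniqueX.indexOf(link.textItem.x); });

// The TOC block then renders each entry as ' '.repeat(link.level * 3) + '- ' + text:
// - Introduction   (level 0)
//    - Background  (level 1)
// - Results        (level 0)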
test/HeadlineFinder.spec.js (new file, 134 lines):

@@ -0,0 +1,134 @@
+import { expect } from 'chai';
+
+import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
+import TextItem from '../src/javascript/models/TextItem.jsx';
+
+describe('HeadlineFinder', () => {
+
+
+    it('Not Found - Case 1', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline2'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
+
+    });
+
+    it('Found - Simple', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Waste in beginning', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item0 = new TextItem({
+            text: 'Waste '
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item0)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Duplicate in beginning', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item0 = new TextItem({
+            text: 'My '
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item0)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0);
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Mixed up case and Whitespace', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'MYLitt le HEADline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+});
Spec for functions.jsx (under test/):

@@ -1,6 +1,6 @@
 import { expect } from 'chai';
 
-import { hasUpperCaseCharacterInMiddleOfWord } from '../src/javascript/functions.jsx'
+import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, charCodeArray } from '../src/javascript/functions.jsx'
 
 describe('hasUpperCaseCharacterInMiddleOfWord', () => {
 
@@ -37,3 +37,42 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
         expect(hasUpperCaseCharacterInMiddleOfWord("High 5'Sec")).to.equal(true);
     });
 });
+
+describe('charCodeArray', () => {
+    it('Charcodes', () => {
+        expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
+    });
+
+    it('Convert Back', () => {
+        expect(String.fromCharCode.apply(null, charCodeArray("word"))).to.equal("word");
+        expect(String.fromCharCode.apply(null, charCodeArray("WORD"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, charCodeArray("a word"))).to.equal("a word");
+    });
+
+});
+
+describe('normalizedCharCodeArray', () => {
+
+    it('No Change', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD23"))).to.equal("WORD23");
+    });
+
+    it('lowecaseToUpperCase', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("word"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WoRd"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("word23"))).to.equal("WORD23");
+    });
+
+    it('RemoveWhiteSpace', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("A WORD"))).to.equal("AWORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("SOME LITTLE SENTENCE."))).to.equal("SOMELITTLESENTENCE");
+    });
+
+    it('All', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("a word"))).to.equal("AWORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WoRd 4 u"))).to.equal("WORD4U");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("Some little sentence."))).to.equal("SOMELITTLESENTENCE");
+    });
+
+});