[WIP] first draft complete formats transformation

This commit is contained in:
Johannes Zillmann 2017-03-24 12:30:35 +01:00
parent 81518a857b
commit 10cc7cf0ab
10 changed files with 336 additions and 22 deletions

View File

@ -52,8 +52,26 @@ export default class TextItemTable extends React.Component {
{ textItem.type ? textItem.type.name : '' } { textItem.type ? textItem.type.name : '' }
</div> </div>
<div style={ { textAlign: 'center', color: 'orange' } }> <div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? <div>
{ textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' } Footnote-Link
</div> : '' }
{ textItem.parsedElements && textItem.parsedElements.containLinks ? <div>
Link
</div> : '' }
{ textItem.lineFormat ? <div>
{ textItem.lineFormat.name }
</div> : '' }
{ textItem.unopenedFormat ? <div>
Unopened
{ ' ' + textItem.unopenedFormat.name }
</div> : '' }
{ textItem.parsedElements && textItem.parsedElements.inlineFormats > 0 ? <div>
{ textItem.parsedElements.inlineFormats + 'x Bold/Italic' }
</div> : '' }
{ textItem.unclosedFormat ? <div>
Unclosed
{ ' ' + textItem.unclosedFormat.name }
</div> : '' }
</div> </div>
</td> </td>
<td> <td>

View File

@ -8,12 +8,11 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import CompleteFormats from './transformations/textitem/CompleteFormats.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx' import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx'
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextBlocks from './transformations/ToTextBlocks.jsx'; import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx' import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -59,15 +58,14 @@ export default class AppState {
new VerticalToHorizontal(), new VerticalToHorizontal(),
new PostprocessLines(), new PostprocessLines(),
new DetectTOC(), new DetectTOC(),
new DetectListItems(),
new DetectHeaders(), new DetectHeaders(),
new CompleteFormats(),
new DetectListItems(),
new GatherBlocks(), new GatherBlocks(),
new DetectCodeQuoteBlocks(), new DetectCodeQuoteBlocks(),
new DetectListLevels(), new DetectListLevels(),
// new DetectFormats(),
// new HeadlineToUppercase(),
new ToTextBlocks(), new ToTextBlocks(),
new ToMarkdown()]; new ToMarkdown()];

View File

@ -18,12 +18,14 @@ export class ParsedElements {
this.footnoteLinks = options.footnoteLinks || []; this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes || []; this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks; this.containLinks = options.containLinks;
this.inlineFormats = options.inlineFormats || 0;
} }
add(parsedElements) { add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks; this.containLinks = this.containLinks || parsedElements.containLinks;
this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats;
} }
} }

View File

@ -3,4 +3,23 @@ import { Enum } from 'enumify';
export default class StringFormat extends Enum { export default class StringFormat extends Enum {
} }
StringFormat.initEnum(['STANDARD', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE']) StringFormat.initEnum({
STANDARD: {
needFormat: false
},
BOLD: {
needFormat: true,
startSymbol: '**',
endSymbol: '**'
},
OBLIQUE: {
needFormat: true,
startSymbol: '_',
endSymbol: '_'
},
BOLD_OBLIQUE: {
needFormat: true,
startSymbol: '**_',
endSymbol: '_**'
}
})

View File

@ -13,6 +13,10 @@ export default class TextItem extends PageItem {
this.font = options.font; this.font = options.font;
this.fontAscent = options.fontAscent; this.fontAscent = options.fontAscent;
this.fontDescent = options.fontDescent; this.fontDescent = options.fontDescent;
this.lineFormat = options.lineFormat;
this.unopenedFormat = options.unopenedFormat;
this.unclosedFormat = options.unclosedFormat;
} }
} }

View File

@ -7,10 +7,8 @@ import { sortByX } from '../textItemFunctions.jsx'
//'whitespace removal', bold/emphasis annotation, link-detection, etc.. //'whitespace removal', bold/emphasis annotation, link-detection, etc..
export default class TextItemLineCompactor { export default class TextItemLineCompactor {
constructor(options) { constructor(fontToFormats) {
if (options) { this.fontToFormats = fontToFormats;
this.transformEmphasis = options.transformEmphasis || true;
}
} }
// returns a CombineResult // returns a CombineResult
@ -22,8 +20,10 @@ export default class TextItemLineCompactor {
// we can't trust order of occurence, esp. footnoteLinks like to come last // we can't trust order of occurence, esp. footnoteLinks like to come last
sortByX(lineItems); sortByX(lineItems);
var combinedItem;
const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems); const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
var combinedItem;
if (resolvedLineItems.length == 1) { if (resolvedLineItems.length == 1) {
combinedItem = resolvedLineItems[0]; combinedItem = resolvedLineItems[0];
} else { } else {
@ -51,9 +51,93 @@ export default class TextItemLineCompactor {
}); });
} }
combinedItem.parsedElements = parsedElements; combinedItem.parsedElements = parsedElements;
combinedItem.lineFormat = lineFormat;
combinedItem.unopenedFormat = unopenedFormat;
combinedItem.unclosedFormat = unclosedFormat;
return combinedItem; return combinedItem;
} }
addFormats(resolvedLineItems, parsedElements) {
var inlineFormats = 0;
var openFormatType;
var openFormatItem;
var openFormatIndex;
var lastItem;
var lineFormat;
var unopenedFormat;
var unclosedFormat;
const addStartSymbol = () => {
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
...openFormatItem,
text: openFormatType.startSymbol + openFormatItem.text
}));
}
const addEndSymbol = (index) => {
resolvedLineItems.splice(index, 1, new TextItem({
...lastItem,
text: lastItem.text + openFormatType.endSymbol
}));
}
const addCompleteSymbol = () => {
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
...openFormatItem,
text: openFormatType.startSymbol + openFormatItem.text + openFormatType.endSymbol
}));
}
const rollupOpenFormat = (endIndex) => {
const formatFromBeginningOfLine = openFormatIndex == 0;
const formatToEndOfLine = endIndex == resolvedLineItems.length - 1;
if (formatFromBeginningOfLine) {
if (formatToEndOfLine) {
lineFormat = openFormatType;
} else {
unopenedFormat = openFormatType;
addEndSymbol(endIndex);
}
} else {
if (formatToEndOfLine) {
unclosedFormat = openFormatType;
addStartSymbol();
} else {
inlineFormats++;
if (lastItem === openFormatItem) {
addCompleteSymbol();
} else {
addStartSymbol();
addEndSymbol();
}
}
}
};
resolvedLineItems.slice().forEach((item, i) => {
const formatType = this.fontToFormats.get(item.font);
if (openFormatType) {
if (formatType !== openFormatType) { //closin existing format
rollupOpenFormat(i - 1);
openFormatType = formatType.needFormat ? formatType : null;
openFormatItem = formatType.needFormat ? item : null;
openFormatIndex = formatType.needFormat ? i : null;
}
} else {
if (formatType.needFormat) {
openFormatType = formatType;
openFormatItem = item;
openFormatIndex = i;
}
}
lastItem = item;
});
if (openFormatType) {
rollupOpenFormat(resolvedLineItems.length - 1);
}
parsedElements.inlineFormats = inlineFormats;
return [lineFormat, unopenedFormat, unclosedFormat];
}
resolveSpecialElements(lineItems) { resolveSpecialElements(lineItems) {
const footnoteLinks = []; const footnoteLinks = [];
const footnotes = []; const footnotes = [];

View File

@ -57,11 +57,11 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
var format; var format;
if (key == mostUsedFont) { if (key == mostUsedFont) {
format = StringFormat.STANDARD; format = StringFormat.STANDARD;
} else if (fontName.includes('bold') && fontName.includes('bold')) { } else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
format = StringFormat.BOLD_OBLIQUE; format = StringFormat.BOLD_OBLIQUE;
} else if (fontName.includes('bold')) { } else if (fontName.includes('bold')) {
format = StringFormat.BOLD; format = StringFormat.BOLD;
} else if (fontName.includes('oblique')) { } else if (fontName.includes('oblique') || fontName.includes('italic')) {
format = StringFormat.OBLIQUE; format = StringFormat.OBLIQUE;
} else if (fontName === maxHeightFont) { } else if (fontName === maxHeightFont) {
format = StringFormat.BOLD; format = StringFormat.BOLD;

View File

@ -2,6 +2,7 @@ import React from 'react';
import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx'; import ParseResult from '../../ParseResult.jsx';
import { ParsedElements } from '../../PageItem.jsx';
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx'; import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
import TextItemLineCompactor from '../../TextItemLineCompactor.jsx'; import TextItemLineCompactor from '../../TextItemLineCompactor.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
@ -16,13 +17,18 @@ export default class CompactLines extends ToTextItemTransformation {
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals; const {mostUsedDistance, fontToFormats} = parseResult.globals;
const foundFootnotes = []; const foundFootnotes = [];
const foundFootnoteLinks = []; const foundFootnoteLinks = [];
var inlineFormats = 0;
var lineFormats = 0;
var unopenedFormats = 0;
var unclosedFormats = 0;
const lineGrouper = new TextItemLineGrouper({ const lineGrouper = new TextItemLineGrouper({
mostUsedDistance: mostUsedDistance, mostUsedDistance: mostUsedDistance,
}); });
const lineCompactor = new TextItemLineCompactor(); const lineCompactor = new TextItemLineCompactor(fontToFormats);
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
if (page.items.length > 0) { if (page.items.length > 0) {
@ -32,6 +38,13 @@ export default class CompactLines extends ToTextItemTransformation {
var lineItem; var lineItem;
if (textItemsOfLine.length == 1) { if (textItemsOfLine.length == 1) {
lineItem = textItemsOfLine[0]; lineItem = textItemsOfLine[0];
const formatType = fontToFormats.get(lineItem.font);
if (formatType.needFormat) {
lineItem.lineFormat = formatType;
lineItem.parsedElements = new ParsedElements({
completeLineFormats: 1
});
}
} else { } else {
textItemsOfLine.forEach(item => { textItemsOfLine.forEach(item => {
item.annotation = REMOVED_ANNOTATION; item.annotation = REMOVED_ANNOTATION;
@ -50,7 +63,11 @@ export default class CompactLines extends ToTextItemTransformation {
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>); const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
foundFootnotes.push.apply(foundFootnotes, footnotes); foundFootnotes.push.apply(foundFootnotes, footnotes);
} }
inlineFormats += lineItem.parsedElements.inlineFormats;
} }
if (lineItem.lineFormat) lineFormats++;
if (lineItem.unopenedFormat) unopenedFormats++;
if (lineItem.unclosedFormat) unclosedFormats++;
lineItem.text = lineItem.text.trim(); lineItem.text = lineItem.text.trim();
newItems.push(lineItem); newItems.push(lineItem);
}); });
@ -62,9 +79,10 @@ export default class CompactLines extends ToTextItemTransformation {
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
messages: [ messages: [
// 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']', 'Detected ' + lineFormats + ' line formats',
//'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']', 'Detected ' + inlineFormats + ' inline formats',
// 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']', 'Detected ' + unclosedFormats + ' opened un-closed formats',
'Detected ' + unopenedFormats + ' un-opened closed formats',
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>, <span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>, <span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
] ]

View File

@ -0,0 +1,170 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx';
//Complete unopened/unclosed bold/italic formats
export default class CompleteFormats extends ToTextItemTransformation {
constructor() {
super("Complete Bold/Italics");
}
transform(parseResult:ParseResult) {
// remove line formats from headers
parseResult.pages.forEach(page => {
page.items.forEach(item => {
if (item.type && item.type.headline) {
if (item.lineFormat || item.unopenedFormat || item.unclosedFormat) {
item.lineFormat = null;
item.unopenedFormat = null;
item.unclosedFormat = null;
item.annotation = UNCHANGED_ANNOTATION;
}
}
});
});
//close open formats
parseResult.pages.forEach(page => {
const itemStack = new ItemStack();
page.items.forEach(item => {
itemStack.consume(item);
});
page.items = itemStack.getResults();
});
return new ParseResult({
...parseResult,
messages: []
});
}
}
/**
 * Consumes the text items of a page one by one and collects the resulting
 * item list, adding missing opening/closing format symbols on the way.
 *
 * At most one format is "open" at a time. The item that opened it is cached so
 * a closing symbol can be appended to it as soon as a following item shows
 * that the format was not closed by the document itself.
 */
class ItemStack {

    constructor() {
        this.openFormat = null;
        this.openFormatItem = null;
        // True while the cached item is a derived copy that has not been
        // written to the results yet.
        this.openFormatItemPending = false;
        this.resultItems = [];
    }

    /**
     * Remembers `textItem` as the item that left `format` open.
     *
     * @param textItem the item to which a closing symbol may be appended later
     * @param format the format left open
     * @param {boolean} pending true if `textItem` is not part of the results yet
     */
    cache(textItem, format, pending = false) {
        // Never drop an unwritten copy when it gets replaced.
        this.flushPending();
        this.openFormat = format;
        this.openFormatItem = textItem;
        this.openFormatItemPending = pending;
    }

    // Writes the cached item to the results if that has not happened yet.
    flushPending() {
        if (this.openFormatItemPending) {
            this.writeToResults(this.openFormatItem);
            this.openFormatItemPending = false;
        }
    }

    // Marks the cached item as removed and emits a copy carrying the end symbol.
    closeOpenFormat() {
        if (this.openFormat) {
            this.openFormatItem.annotation = REMOVED_ANNOTATION;
            this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat));
            // The closed copy supersedes an unwritten cached copy.
            this.openFormatItemPending = false;
            this.clear();
        }
    }

    // Forgets the open format; an unwritten cached copy is emitted first.
    clear() {
        this.flushPending();
        this.openFormat = null;
        this.openFormatItem = null;
    }

    writeToResults(textItem) {
        this.resultItems.push(textItem);
    }

    // Returns all collected items, closing a still open format beforehand.
    getResults() {
        if (this.openFormat) {
            this.closeOpenFormat();
        }
        return this.resultItems;
    }

    /**
     * Processes the next item of the page.
     *
     * @param item text item, optionally carrying `unopenedFormat`,
     *        `lineFormat` and/or `unclosedFormat`
     */
    consume(item) {
        let newItem;

        // Item closes a format nobody opened: emit a copy with the start symbol.
        const handleFreshUnopened = () => {
            item.annotation = REMOVED_ANNOTATION;
            newItem = textItemWithOpening(item, item.unopenedFormat);
        };
        // Whole line is formatted: emit a copy with the start symbol, keep the format open.
        const handleFreshLine = () => {
            item.annotation = REMOVED_ANNOTATION;
            newItem = textItemWithOpening(item, item.lineFormat);
            this.cache(newItem, item.lineFormat);
        };
        // Item leaves a format open: remember it (or its derived copy) for closing.
        const handleFreshUnclosed = () => {
            if (newItem) {
                // The copy is held back; it is written either with a closing
                // symbol or unchanged once the format is resolved.
                this.cache(newItem, item.unclosedFormat, true);
                newItem = null;
            } else {
                this.cache(item, item.unclosedFormat);
            }
        };

        //flush open format if possible
        if (this.openFormat) {
            if (item.unopenedFormat) {
                if (item.unopenedFormat === this.openFormat) {
                    //good, closing an opened
                    this.clear();
                } else {
                    this.closeOpenFormat();
                    handleFreshUnopened();
                }
            }
            if (item.lineFormat) {
                if (item.lineFormat === this.openFormat) {
                    this.cache(item, item.lineFormat);
                } else {
                    this.closeOpenFormat();
                    handleFreshLine();
                }
            }
            if (item.unclosedFormat) {
                this.closeOpenFormat();
                handleFreshUnclosed();
            }
            if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) {
                this.closeOpenFormat();
            }
        } else { // handle fresh items
            if (item.unopenedFormat) {
                handleFreshUnopened();
            }
            if (item.lineFormat) {
                handleFreshLine();
            }
            if (item.unclosedFormat) {
                handleFreshUnclosed();
            }
        }

        this.writeToResults(item);
        if (newItem) {
            this.writeToResults(newItem);
        }
    }

}
/**
 * Creates a copy of `textItem` whose text is prefixed with the start symbol
 * of `format`. The copy is annotated as added; the given item is not modified.
 *
 * @param textItem item to copy
 * @param format format providing `startSymbol`
 * @returns {TextItem} the new item
 */
function textItemWithOpening(textItem, format) {
    const openedText = format.startSymbol + textItem.text;
    const options = {
        ...textItem,
        text: openedText,
        annotation: ADDED_ANNOTATION
    };
    return new TextItem(options);
}
/**
 * Creates a copy of `textItem` whose text is suffixed with the end symbol of
 * `format`. The copy is annotated as added; the given item is not modified.
 *
 * @param textItem item to copy
 * @param format format providing `endSymbol`
 * @returns {TextItem} the new item
 */
function textItemWithClosing(textItem, format) {
    const closedText = textItem.text + format.endSymbol;
    const options = {
        ...textItem,
        text: closedText,
        annotation: ADDED_ANNOTATION
    };
    return new TextItem(options);
}

View File

@ -3,6 +3,7 @@ import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx'; import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx'; import { headlineByLevel } from '../../ElementType.jsx';
import { isListItem } from '../../../functions.jsx';
//Detect items starting with -, , etc... //Detect items starting with -, , etc...
export default class DetectHeaders extends ToTextItemTransformation { export default class DetectHeaders extends ToTextItemTransformation {
@ -56,7 +57,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
var lastHeight; var lastHeight;
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(textItem => {
if (!textItem.type && textItem.height > mostUsedHeight) { if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) {
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) { if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
heights.push(textItem.height); heights.push(textItem.height);
} }
@ -69,7 +70,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
const headlineType = headlineByLevel(2 + i); const headlineType = headlineByLevel(2 + i);
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(textItem => {
if (!textItem.type && textItem.height == height) { if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) {
detectedHeaders++; detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION; textItem.annotation = DETECTED_ANNOTATION;
textItem.type = headlineType; textItem.type = headlineType;