[WIP] first draft complete formats transformation

This commit is contained in:
Johannes Zillmann 2017-03-24 12:30:35 +01:00
parent 81518a857b
commit 10cc7cf0ab
10 changed files with 336 additions and 22 deletions

View File

@ -52,8 +52,26 @@ export default class TextItemTable extends React.Component {
{ textItem.type ? textItem.type.name : '' }
</div>
<div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
{ textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? <div>
Footnote-Link
</div> : '' }
{ textItem.parsedElements && textItem.parsedElements.containLinks ? <div>
Link
</div> : '' }
{ textItem.lineFormat ? <div>
{ textItem.lineFormat.name }
</div> : '' }
{ textItem.unopenedFormat ? <div>
Unopened
{ ' ' + textItem.unopenedFormat.name }
</div> : '' }
{ textItem.parsedElements && textItem.parsedElements.inlineFormats > 0 ? <div>
{ textItem.parsedElements.inlineFormats + 'x Bold/Italic' }
</div> : '' }
{ textItem.unclosedFormat ? <div>
Unclosed
{ ' ' + textItem.unclosedFormat.name }
</div> : '' }
</div>
</td>
<td>

View File

@ -8,12 +8,11 @@ import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import CompleteFormats from './transformations/textitem/CompleteFormats.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx'
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -59,15 +58,14 @@ export default class AppState {
new VerticalToHorizontal(),
new PostprocessLines(),
new DetectTOC(),
new DetectListItems(),
new DetectHeaders(),
new CompleteFormats(),
new DetectListItems(),
new GatherBlocks(),
new DetectCodeQuoteBlocks(),
new DetectListLevels(),
// new DetectFormats(),
// new HeadlineToUppercase(),
new ToTextBlocks(),
new ToMarkdown()];

View File

@ -18,12 +18,14 @@ export class ParsedElements {
this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks;
this.inlineFormats = options.inlineFormats || 0;
}
add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks;
this.inlineFormats = this.inlineFormats + parsedElements.inlineFormats;
}
}

View File

@ -3,4 +3,23 @@ import { Enum } from 'enumify';
export default class StringFormat extends Enum {
}
StringFormat.initEnum(['STANDARD', 'BOLD', 'OBLIQUE', 'BOLD_OBLIQUE'])
// Each format carries the properties used when formats are written into the item text:
//  - needFormat:  whether text in this format has to be wrapped in symbols at all
//  - startSymbol: string that gets prepended to the text where the format opens
//  - endSymbol:   string that gets appended to the text where the format closes
StringFormat.initEnum({
STANDARD: {
// plain text, no symbols required (hence no start/end symbol defined)
needFormat: false
},
BOLD: {
needFormat: true,
startSymbol: '**',
endSymbol: '**'
},
OBLIQUE: {
needFormat: true,
startSymbol: '_',
endSymbol: '_'
},
BOLD_OBLIQUE: {
needFormat: true,
// the end symbol mirrors the start symbol so the two markers nest properly
startSymbol: '**_',
endSymbol: '_**'
}
})

View File

@ -13,6 +13,10 @@ export default class TextItem extends PageItem {
this.font = options.font;
this.fontAscent = options.fontAscent;
this.fontDescent = options.fontDescent;
this.lineFormat = options.lineFormat;
this.unopenedFormat = options.unopenedFormat;
this.unclosedFormat = options.unclosedFormat;
}
}

View File

@ -7,10 +7,8 @@ import { sortByX } from '../textItemFunctions.jsx'
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
export default class TextItemLineCompactor {
constructor(options) {
if (options) {
this.transformEmphasis = options.transformEmphasis || true;
}
constructor(fontToFormats) {
this.fontToFormats = fontToFormats;
}
// returns a CombineResult
@ -22,8 +20,10 @@ export default class TextItemLineCompactor {
// we can't trust order of occurence, esp. footnoteLinks like to come last
sortByX(lineItems);
var combinedItem;
const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
const [lineFormat, unopenedFormat, unclosedFormat] = this.addFormats(resolvedLineItems, parsedElements);
var combinedItem;
if (resolvedLineItems.length == 1) {
combinedItem = resolvedLineItems[0];
} else {
@ -51,9 +51,93 @@ export default class TextItemLineCompactor {
});
}
combinedItem.parsedElements = parsedElements;
combinedItem.lineFormat = lineFormat;
combinedItem.unopenedFormat = unopenedFormat;
combinedItem.unclosedFormat = unclosedFormat;
return combinedItem;
}
addFormats(resolvedLineItems, parsedElements) {
var inlineFormats = 0;
var openFormatType;
var openFormatItem;
var openFormatIndex;
var lastItem;
var lineFormat;
var unopenedFormat;
var unclosedFormat;
const addStartSymbol = () => {
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
...openFormatItem,
text: openFormatType.startSymbol + openFormatItem.text
}));
}
const addEndSymbol = (index) => {
resolvedLineItems.splice(index, 1, new TextItem({
...lastItem,
text: lastItem.text + openFormatType.endSymbol
}));
}
const addCompleteSymbol = () => {
resolvedLineItems.splice(openFormatIndex, 1, new TextItem({
...openFormatItem,
text: openFormatType.startSymbol + openFormatItem.text + openFormatType.endSymbol
}));
}
const rollupOpenFormat = (endIndex) => {
const formatFromBeginningOfLine = openFormatIndex == 0;
const formatToEndOfLine = endIndex == resolvedLineItems.length - 1;
if (formatFromBeginningOfLine) {
if (formatToEndOfLine) {
lineFormat = openFormatType;
} else {
unopenedFormat = openFormatType;
addEndSymbol(endIndex);
}
} else {
if (formatToEndOfLine) {
unclosedFormat = openFormatType;
addStartSymbol();
} else {
inlineFormats++;
if (lastItem === openFormatItem) {
addCompleteSymbol();
} else {
addStartSymbol();
addEndSymbol();
}
}
}
};
resolvedLineItems.slice().forEach((item, i) => {
const formatType = this.fontToFormats.get(item.font);
if (openFormatType) {
if (formatType !== openFormatType) { //closin existing format
rollupOpenFormat(i - 1);
openFormatType = formatType.needFormat ? formatType : null;
openFormatItem = formatType.needFormat ? item : null;
openFormatIndex = formatType.needFormat ? i : null;
}
} else {
if (formatType.needFormat) {
openFormatType = formatType;
openFormatItem = item;
openFormatIndex = i;
}
}
lastItem = item;
});
if (openFormatType) {
rollupOpenFormat(resolvedLineItems.length - 1);
}
parsedElements.inlineFormats = inlineFormats;
return [lineFormat, unopenedFormat, unclosedFormat];
}
resolveSpecialElements(lineItems) {
const footnoteLinks = [];
const footnotes = [];

View File

@ -57,11 +57,11 @@ export default class CalculateGlobalStats extends ToTextItemTransformation {
var format;
if (key == mostUsedFont) {
format = StringFormat.STANDARD;
} else if (fontName.includes('bold') && fontName.includes('bold')) {
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
format = StringFormat.BOLD_OBLIQUE;
} else if (fontName.includes('bold')) {
format = StringFormat.BOLD;
} else if (fontName.includes('oblique')) {
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
format = StringFormat.OBLIQUE;
} else if (fontName === maxHeightFont) {
format = StringFormat.BOLD;

View File

@ -2,6 +2,7 @@ import React from 'react';
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import { ParsedElements } from '../../PageItem.jsx';
import TextItemLineGrouper from '../../TextItemLineGrouper.jsx';
import TextItemLineCompactor from '../../TextItemLineCompactor.jsx';
import ElementType from '../../ElementType.jsx';
@ -16,13 +17,18 @@ export default class CompactLines extends ToTextItemTransformation {
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
const {mostUsedDistance, fontToFormats} = parseResult.globals;
const foundFootnotes = [];
const foundFootnoteLinks = [];
var inlineFormats = 0;
var lineFormats = 0;
var unopenedFormats = 0;
var unclosedFormats = 0;
const lineGrouper = new TextItemLineGrouper({
mostUsedDistance: mostUsedDistance,
});
const lineCompactor = new TextItemLineCompactor();
const lineCompactor = new TextItemLineCompactor(fontToFormats);
parseResult.pages.forEach(page => {
if (page.items.length > 0) {
@ -32,6 +38,13 @@ export default class CompactLines extends ToTextItemTransformation {
var lineItem;
if (textItemsOfLine.length == 1) {
lineItem = textItemsOfLine[0];
const formatType = fontToFormats.get(lineItem.font);
if (formatType.needFormat) {
lineItem.lineFormat = formatType;
lineItem.parsedElements = new ParsedElements({
completeLineFormats: 1
});
}
} else {
textItemsOfLine.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
@ -50,7 +63,11 @@ export default class CompactLines extends ToTextItemTransformation {
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
foundFootnotes.push.apply(foundFootnotes, footnotes);
}
inlineFormats += lineItem.parsedElements.inlineFormats;
}
if (lineItem.lineFormat) lineFormats++;
if (lineItem.unopenedFormat) unopenedFormats++;
if (lineItem.unclosedFormat) unclosedFormats++;
lineItem.text = lineItem.text.trim();
newItems.push(lineItem);
});
@ -62,9 +79,10 @@ export default class CompactLines extends ToTextItemTransformation {
return new ParseResult({
...parseResult,
messages: [
// 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']',
//'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']',
// 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']',
'Detected ' + lineFormats + ' line formats',
'Detected ' + inlineFormats + ' inline formats',
'Detected ' + unclosedFormats + ' opened un-closed formats',
'Detected ' + unopenedFormats + ' un-opened closed formats',
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
]

View File

@ -0,0 +1,170 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import { UNCHANGED_ANNOTATION, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../../Annotation.jsx';
//Complete unopened/unclosed bold/italic formats
export default class CompleteFormats extends ToTextItemTransformation {
constructor() {
super("Complete Bold/Italics");
}
transform(parseResult:ParseResult) {
// remove line formats from headers
parseResult.pages.forEach(page => {
page.items.forEach(item => {
if (item.type && item.type.headline) {
if (item.lineFormat || item.unopenedFormat || item.unclosedFormat) {
item.lineFormat = null;
item.unopenedFormat = null;
item.unclosedFormat = null;
item.annotation = UNCHANGED_ANNOTATION;
}
}
});
});
//close open formats
parseResult.pages.forEach(page => {
const itemStack = new ItemStack();
page.items.forEach(item => {
itemStack.consume(item);
});
page.items = itemStack.getResults();
});
return new ParseResult({
...parseResult,
messages: []
});
}
}
/**
 * Consumes the text items of a page one by one and collects the resulting items,
 * completing bold/italic formats that span several items.
 *
 * Items are never modified textually. Whenever a symbol has to be added, the
 * affected item is annotated with REMOVED_ANNOTATION and a copy carrying the
 * symbol (annotated with ADDED_ANNOTATION) is written after it.
 *
 * At most one format is "open" at a time: the format itself plus the item
 * which would receive the end symbol if the format has to be closed artificially.
 */
class ItemStack {

    constructor() {
        this.openFormat = null;
        this.openFormatItem = null;
        this.resultItems = [];
    }

    // Remembers the given format as open, with textItem being its last item so far.
    cache(textItem, format) {
        this.openFormat = format;
        this.openFormatItem = textItem;
    }

    // Closes the open format (if any) by superseding its last item with a copy
    // that has the end symbol appended.
    closeOpenFormat() {
        if (this.openFormat) {
            this.openFormatItem.annotation = REMOVED_ANNOTATION;
            this.writeToResults(textItemWithClosing(this.openFormatItem, this.openFormat));
            this.clear();
        }
    }

    clear() {
        this.openFormat = null;
        this.openFormatItem = null;
    }

    writeToResults(textItem) {
        this.resultItems.push(textItem);
    }

    // Returns all collected items; a format still open at this point gets closed first.
    getResults() {
        this.closeOpenFormat();
        return this.resultItems;
    }

    // Processes the next item, looking at its unopenedFormat, lineFormat and unclosedFormat.
    consume(item) {
        // copy of the item with an added start symbol (if one was needed)
        let newItem;

        // item closes a format which nobody opened -> open it at the start of the item
        const handleFreshUnopened = () => {
            item.annotation = REMOVED_ANNOTATION;
            newItem = textItemWithOpening(item, item.unopenedFormat);
        };
        // whole item is formatted -> open the format and keep it open for following items
        const handleFreshLine = () => {
            item.annotation = REMOVED_ANNOTATION;
            newItem = textItemWithOpening(item, item.lineFormat);
            this.cache(newItem, item.lineFormat);
        };
        // item opens a format without closing it -> keep it open for following items.
        // If an opened copy exists, it becomes the open item but stays scheduled for
        // writing. Otherwise it would be lost (with the original marked as removed)
        // whenever a following item closes the format regularly.
        const handleFreshUnclosed = () => {
            this.cache(newItem || item, item.unclosedFormat);
        };

        if (this.openFormat) {
            //flush open format if possible
            if (item.unopenedFormat) {
                if (item.unopenedFormat === this.openFormat) {
                    //good, closing an opened
                    this.clear();
                } else {
                    this.closeOpenFormat();
                    handleFreshUnopened();
                }
            }
            if (item.lineFormat) {
                if (item.lineFormat === this.openFormat) {
                    // same format continues over this whole item
                    this.cache(item, item.lineFormat);
                } else {
                    this.closeOpenFormat();
                    handleFreshLine();
                }
            }
            if (item.unclosedFormat) {
                this.closeOpenFormat();
                handleFreshUnclosed();
            }
            if (!item.unopenedFormat && !item.lineFormat && !item.unclosedFormat) {
                this.closeOpenFormat();
            }
        } else {
            // handle fresh items
            if (item.unopenedFormat) {
                handleFreshUnopened();
            }
            if (item.lineFormat) {
                handleFreshLine();
            }
            if (item.unclosedFormat) {
                handleFreshUnclosed();
            }
        }

        this.writeToResults(item);
        if (newItem) {
            this.writeToResults(newItem);
        }
    }

}
// Creates a copy of the given item whose text is prefixed with the start symbol
// of the format. The copy is annotated as added.
function textItemWithOpening(textItem, format) {
    const openedText = format.startSymbol + textItem.text;
    const options = {
        ...textItem,
        text: openedText,
        annotation: ADDED_ANNOTATION
    };
    return new TextItem(options);
}
// Creates a copy of the given item whose text is suffixed with the end symbol
// of the format. The copy is annotated as added.
function textItemWithClosing(textItem, format) {
    const closedText = textItem.text + format.endSymbol;
    const options = {
        ...textItem,
        text: closedText,
        annotation: ADDED_ANNOTATION
    };
    return new TextItem(options);
}

View File

@ -3,6 +3,7 @@ import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import ElementType from '../../ElementType.jsx';
import { headlineByLevel } from '../../ElementType.jsx';
import { isListItem } from '../../../functions.jsx';
//Detect headlines, e.g. items which are higher than the most used text height
export default class DetectHeaders extends ToTextItemTransformation {
@ -56,7 +57,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
var lastHeight;
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type && textItem.height > mostUsedHeight) {
if (!textItem.type && textItem.height > mostUsedHeight && !isListItem(textItem.text)) {
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
heights.push(textItem.height);
}
@ -69,7 +70,7 @@ export default class DetectHeaders extends ToTextItemTransformation {
const headlineType = headlineByLevel(2 + i);
parseResult.pages.forEach(page => {
page.items.forEach(textItem => {
if (!textItem.type && textItem.height == height) {
if (!textItem.type && textItem.height == height && !isListItem(textItem.text)) {
detectedHeaders++;
textItem.annotation = DETECTED_ANNOTATION;
textItem.type = headlineType;