diff --git a/src/javascript/components/App.jsx b/src/javascript/components/App.jsx index 080e1a5..747b043 100644 --- a/src/javascript/components/App.jsx +++ b/src/javascript/components/App.jsx @@ -28,10 +28,10 @@ export default class App extends React.Component { mainView = break; case View.RESULT: - mainView = + mainView = break; case View.DEBUG: - mainView = + mainView = break; default: throw `View ${this.props.appState.mainView} not supported!`; @@ -46,7 +46,7 @@ export default class App extends React.Component { - ); + ); } } diff --git a/src/javascript/components/DebugView.jsx b/src/javascript/components/DebugView.jsx index 5aaa42e..906c793 100644 --- a/src/javascript/components/DebugView.jsx +++ b/src/javascript/components/DebugView.jsx @@ -17,7 +17,7 @@ import ParseResult from '../models/ParseResult.jsx'; export default class DebugView extends React.Component { static propTypes = { - pdfPages: React.PropTypes.array.isRequired, + pages: React.PropTypes.array.isRequired, transformations: React.PropTypes.array.isRequired, }; @@ -71,12 +71,12 @@ export default class DebugView extends React.Component { render() { const {currentTransformation, pageNr} = this.state; - const {pdfPages, transformations} = this.props; + const {pages, transformations} = this.props; const currentTransformationName = transformations[currentTransformation].name; var parseResult = new ParseResult({ - content: pdfPages + pages: pages }); var lastTransformation; for (var i = 0; i <= currentTransformation; i++) { @@ -87,8 +87,8 @@ export default class DebugView extends React.Component { lastTransformation = transformations[i]; } - parseResult.content = parseResult.content.filter((elem, i) => pageNr == -1 || i == pageNr); - const pageComponents = parseResult.content.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly)); + parseResult.pages = parseResult.pages.filter((elem, i) => pageNr == -1 || i == pageNr); + const pageComponents = parseResult.pages.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly)); const showModificationCheckbox = lastTransformation.showModificationCheckbox(); const statisticsAsList = Object.keys(parseResult.globals).map((key, i) => { return
  • @@ -121,7 +121,7 @@ export default class DebugView extends React.Component { last ellipsis boundaryLinks - items={ pdfPages.length } + items={ pages.length } maxButtons={ 17 } activePage={ this.state.pageNr + 1 } onSelect={ this.selectPage.bind(this) } /> @@ -194,6 +194,6 @@ export default class DebugView extends React.Component { { pageComponents } - ); + ); } } \ No newline at end of file diff --git a/src/javascript/components/LoadingView.jsx b/src/javascript/components/LoadingView.jsx index 4d1c44a..b0748f3 100644 --- a/src/javascript/components/LoadingView.jsx +++ b/src/javascript/components/LoadingView.jsx @@ -3,7 +3,7 @@ import React from 'react'; import pdfjs from 'pdfjs-dist'; // eslint-disable-line no-unused-vars import { Line } from 'rc-progress'; -import PdfPage from '../models/PdfPage.jsx'; +import Page from '../models/Page.jsx'; import TextItem from '../models/TextItem.jsx'; export default class LoadingView extends React.Component { @@ -17,19 +17,19 @@ export default class LoadingView extends React.Component { super(props); this.state = { parsedPages: 0, - pdfPages: [] + pages: [] }; } - anounceInitialParse(pdfPages) { + anounceInitialParse(pages) { this.setState({ - pdfPages: pdfPages + pages: pages }); } anouncePageParsed(index, textItems) { //TODO might make problems.. concat unordered and order at the end ? - this.state.pdfPages[index].textItems = textItems; // eslint-disable-line react/no-direct-mutation-state + this.state.pages[index].items = textItems; // eslint-disable-line react/no-direct-mutation-state this.setState({ parsedPages: this.state.parsedPages + 1 }); @@ -44,13 +44,13 @@ export default class LoadingView extends React.Component { // console.debug(pdfDocument); const numPages = pdfDocument.numPages; // const numPages = 4; // hack - var pdfPages = []; + var pages = []; for (var i = 0; i < numPages; i++) { - pdfPages.push(new PdfPage({ + pages.push(new Page({ index: i })); } - anounceInitialParseFunction(pdfPages); + anounceInitialParseFunction(pages); for (var j = 1; j <= numPages; j++) { pdfDocument.getPage(j).then(function(page) { var scale = 1.0; @@ -96,14 +96,14 @@ export default class LoadingView extends React.Component { } render() { - const {parsedPages, pdfPages} = this.state; + const {parsedPages, pages} = this.state; var percentDone = 0; var details = ''; - if (pdfPages.length > 0) { - percentDone = parsedPages / pdfPages.length * 100; - details = parsedPages + ' / ' + pdfPages.length - if (parsedPages == pdfPages.length) { - this.props.storePdfPagesFunction(this.state.pdfPages); + if (pages.length > 0) { + percentDone = parsedPages / pages.length * 100; + details = parsedPages + ' / ' + pages.length + if (parsedPages == pages.length) { + this.props.storePdfPagesFunction(this.state.pages); } } return ( diff --git a/src/javascript/components/ResultView.jsx b/src/javascript/components/ResultView.jsx index 2111286..ab4489a 100644 --- a/src/javascript/components/ResultView.jsx +++ b/src/javascript/components/ResultView.jsx @@ -10,7 +10,7 @@ import ParseResult from '../models/ParseResult.jsx'; export default class ResultView extends React.Component { static propTypes = { - pdfPages: React.PropTypes.array.isRequired, + pages: React.PropTypes.array.isRequired, transformations: React.PropTypes.array.isRequired, }; @@ -19,9 +19,9 @@ export default class ResultView extends React.Component { } componentWillMount() { - const {pdfPages, transformations} = this.props; + const {pages, transformations} = this.props; var parseResult = new ParseResult({ - content: pdfPages + pages: pages }); var lastTransformation; transformations.forEach(transformation => { @@ -32,10 +32,15 @@ export default class ResultView extends React.Component { lastTransformation = transformation; }); + var text = ''; + parseResult.pages.forEach(page => { + page.items.forEach(item => { + text += item + '\n'; + }); + }); this.state = { preview: true, - text: parseResult.content[0].text - + text: text }; } @@ -90,7 +95,7 @@ export default class ResultView extends React.Component {
    { textComponent } - ); + ); } } \ No newline at end of file diff --git a/src/javascript/components/debug/BlockPageView.jsx b/src/javascript/components/debug/BlockPageView.jsx deleted file mode 100644 index 6d35114..0000000 --- a/src/javascript/components/debug/BlockPageView.jsx +++ /dev/null @@ -1,49 +0,0 @@ -import React from 'react'; - -import Table from 'react-bootstrap/lib/Table' - -export default class BlockPageView extends React.Component { - - static propTypes = { - page: React.PropTypes.object.isRequired, - }; - - render() { - var blocks = this.props.page.blocks; - - const content =
    - - - - - - - - - - { blocks.map((block, i) => - - - - - ) } - -
    - # - - Category - - Text -
    - { i } - - { block.category } - -
    { block.text }
    -
    -
    - return ( - content - ); - } -} \ No newline at end of file diff --git a/src/javascript/components/debug/MarkdownPageView.jsx b/src/javascript/components/debug/MarkdownPageView.jsx index 0f60503..1b979ea 100644 --- a/src/javascript/components/debug/MarkdownPageView.jsx +++ b/src/javascript/components/debug/MarkdownPageView.jsx @@ -1,23 +1,17 @@ import React from 'react'; +import PageView from './PageView.jsx'; import Remarkable from 'remarkable'; -export default class MarkdownPageView extends React.Component { +export default class MarkdownPageView extends PageView { - static propTypes = { - page: React.PropTypes.object.isRequired, - }; - - render() { + createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars const remarkable = new Remarkable({ breaks: true }); - - const html = remarkable.render(this.props.page.text); - return ( -
    -
    -
    - ); + const html = remarkable.render(items[0]); + return
    +
    +
    } } \ No newline at end of file diff --git a/src/javascript/components/debug/PageView.jsx b/src/javascript/components/debug/PageView.jsx new file mode 100644 index 0000000..b79f13c --- /dev/null +++ b/src/javascript/components/debug/PageView.jsx @@ -0,0 +1,41 @@ +import React from 'react'; + +// Abstract view for a Page +export default class PageView extends React.Component { + + static propTypes = { + page: React.PropTypes.object.isRequired, + modificationsOnly: React.PropTypes.bool, + showWhitespaces: React.PropTypes.bool + }; + + createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars + throw new TypeError("Do not call abstract method foo from child."); + } + + render() { + const {page, modificationsOnly, showWhitespaces} = this.props; + + var items = page.items; + if (modificationsOnly) { + items = items.filter(block => block.annotation); + } + + + var content; + if (items.length == 0 && modificationsOnly) { + content =
    + } else { + const itemViews = this.createItemViews(items, showWhitespaces); + const header = "Page " + (page.index + 1); + content =
    +

    { header }

    +
    + { itemViews } +
    + } + return ( + content + ); + } +} \ No newline at end of file diff --git a/src/javascript/components/debug/PdfPageView.jsx b/src/javascript/components/debug/PdfPageView.jsx deleted file mode 100644 index a6f71aa..0000000 --- a/src/javascript/components/debug/PdfPageView.jsx +++ /dev/null @@ -1,35 +0,0 @@ -import React from 'react'; -import TextItemTable from './TextItemTable.jsx'; - -// View for a PdfPage -export default class PdfPageView extends React.Component { - - static propTypes = { - pdfPage: React.PropTypes.object.isRequired, - modificationsOnly: React.PropTypes.bool.isRequired, - showWhitespaces: React.PropTypes.bool - }; - - render() { - const {pdfPage, modificationsOnly, showWhitespaces} = this.props; - const header = "Page " + (pdfPage.index + 1); - var textItems = pdfPage.textItems; - if (modificationsOnly) { - textItems = textItems.filter(item => item.annotation); - } - - var content; - if (textItems.length == 0 && modificationsOnly) { - content =
    - } else { - content =
    -

    { header }

    - -
    - } - - return ( - content - ); - } -} \ No newline at end of file diff --git a/src/javascript/components/debug/PdfBlockPageView.jsx b/src/javascript/components/debug/TextItemBlockPageView.jsx similarity index 64% rename from src/javascript/components/debug/PdfBlockPageView.jsx rename to src/javascript/components/debug/TextItemBlockPageView.jsx index 706d57b..c0bb89c 100644 --- a/src/javascript/components/debug/PdfBlockPageView.jsx +++ b/src/javascript/components/debug/TextItemBlockPageView.jsx @@ -1,24 +1,12 @@ import React from 'react'; +import PageView from './PageView.jsx'; import TextItemTable from './TextItemTable.jsx'; -// View for a PdfBlockPage -export default class PdfBlockPageView extends React.Component { +// View for a Page which items are of kind TextItemBlock +export default class TextItemBlockPageView extends PageView { - static propTypes = { - pdfPage: React.PropTypes.object.isRequired, - modificationsOnly: React.PropTypes.bool.isRequired, - showWhitespaces: React.PropTypes.bool - }; - - render() { - const {pdfPage, modificationsOnly, showWhitespaces} = this.props; - - var blocks = pdfPage.blocks; - if (modificationsOnly) { - blocks = blocks.filter(block => block.annotation); - } - - const blockTables = blocks.map((block, i) => { + createItemViews(items, showWhitespaces) { + const blockTables = items.map((block, i) => { var textItems = block.textItems; const blockType = block.type ? ' - ' + block.type : null; const blockAnnotation = block.annotation ? { ' - ' + block.annotation.category } @@ -56,19 +44,7 @@ export default class PdfBlockPageView extends React.Component {
    }); - - var content; - if (blocks.length == 0 && modificationsOnly) { - content =
    - } else { - const header = "Page " + (pdfPage.index + 1); - content =
    -

    { header }

    - { blockTables } -
    - } - return ( - content - ); + return blockTables; } + } \ No newline at end of file diff --git a/src/javascript/components/debug/TextItemPageView.jsx b/src/javascript/components/debug/TextItemPageView.jsx new file mode 100644 index 0000000..7a3d628 --- /dev/null +++ b/src/javascript/components/debug/TextItemPageView.jsx @@ -0,0 +1,12 @@ +import React from 'react'; +import PageView from './PageView.jsx'; +import TextItemTable from './TextItemTable.jsx'; + +// View for a Page which items are of kind TextItem +export default class TextItemPageView extends PageView { + + createItemViews(items, showWhitespaces) { + return + } + +} \ No newline at end of file diff --git a/src/javascript/components/debug/TextPageView.jsx b/src/javascript/components/debug/TextPageView.jsx new file mode 100644 index 0000000..90badd9 --- /dev/null +++ b/src/javascript/components/debug/TextPageView.jsx @@ -0,0 +1,41 @@ +import React from 'react'; +import PageView from './PageView.jsx'; +import Table from 'react-bootstrap/lib/Table' + +export default class TextPageView extends PageView { + + createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars + return
    + + + + + + + + + + { items.map((block, i) => + + + + + ) } + +
    + # + + Category + + Text +
    + { i } + + { block.category } + +
    { block.text }
    +
    +
    + } + +} \ No newline at end of file diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index f608d90..b4fea84 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -26,7 +26,7 @@ export default class AppState { this.renderFunction = options.renderFunction; this.mainView = View.UPLOAD; this.fileBuffer; - this.pdfPages = []; + this.pages = []; this.transformations = [ new CalculateGlobalStats(), new RemoveRepetitiveElements(), @@ -66,8 +66,8 @@ export default class AppState { this.render() } - storePdfPages(pdfPages) { - this.pdfPages = pdfPages; + storePdfPages(pages) { + this.pages = pages; this.fileBuffer = null; this.mainView = View.RESULT; this.render(); diff --git a/src/javascript/models/BlockPage.jsx b/src/javascript/models/BlockPage.jsx deleted file mode 100644 index a61970f..0000000 --- a/src/javascript/models/BlockPage.jsx +++ /dev/null @@ -1,9 +0,0 @@ -// A page which holds blocks displayable via BlockPageView -export default class BlockPage { - - constructor(options) { - this.index = options.index; - this.blocks = options.blocks; - } - -} diff --git a/src/javascript/models/MarkdownElements.jsx b/src/javascript/models/MarkdownElements.jsx index 3c5371b..1002220 100644 --- a/src/javascript/models/MarkdownElements.jsx +++ b/src/javascript/models/MarkdownElements.jsx @@ -1,4 +1,4 @@ -import PdfBlock from './BlockPage.jsx'; +import TextItemBlock from './TextItemBlock.jsx'; import TextItemCombiner from './TextItemCombiner.jsx'; import TextItem from './TextItem.jsx'; @@ -31,7 +31,7 @@ export function headlineByLevel(level) { throw "Unsupported headline level: " + level; } -export function blockToText(block: PdfBlock) { +export function blockToText(block: TextItemBlock) { switch (block.type) { case CODE_BLOCK: return '```\n' + concatTextItems(block.textItems) + '```' diff --git a/src/javascript/models/Page.jsx b/src/javascript/models/Page.jsx new file mode 100644 index 0000000..63a4820 --- /dev/null +++ b/src/javascript/models/Page.jsx @@ -0,0 +1,9 @@ +// A page which holds PageItems displayable via PdfPageView +export default class Page { + + constructor(options) { + this.index = options.index; + this.items = options.items || []; //PageItem + } + +} diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx new file mode 100644 index 0000000..cb01549 --- /dev/null +++ b/src/javascript/models/PageItem.jsx @@ -0,0 +1,13 @@ +// A abstract PageItem class, can be TextItem, or TextItemBlock +export default class PageItem { + + constructor(options) { + if (this.constructor === PageItem) { + throw new TypeError("Can not construct abstract class."); + } + this.type = options.type; + this.annotation = options.annotation; + this.parsedElements = options.parsedElements; + } + +} diff --git a/src/javascript/models/ParseResult.jsx b/src/javascript/models/ParseResult.jsx index 709e1a6..eb2bdc1 100644 --- a/src/javascript/models/ParseResult.jsx +++ b/src/javascript/models/ParseResult.jsx @@ -2,7 +2,7 @@ export default class ParseResult { constructor(options) { - this.content = options.content; // like PdfPages[] + this.pages = options.pages; // like Page[] this.globals = options.globals; // properties accasable for all the following transformations in debug mode this.messages = options.messages; // something to show only for the transformation in debug mode } diff --git a/src/javascript/models/PdfBlock.jsx b/src/javascript/models/PdfBlock.jsx deleted file mode 100644 index 3423cce..0000000 --- a/src/javascript/models/PdfBlock.jsx +++ /dev/null @@ -1,11 +0,0 @@ -// A block within a PdfPage -export default class PdfBlock { - - constructor(options) { - this.textItems = options.textItems; - this.type = options.type; - this.annotation = options.annotation; - this.parsedElements = options.parsedElements; - } - -} diff --git a/src/javascript/models/PdfBlockPage.jsx b/src/javascript/models/PdfBlockPage.jsx deleted file mode 100644 index aca6ab8..0000000 --- a/src/javascript/models/PdfBlockPage.jsx +++ /dev/null @@ -1,9 +0,0 @@ -// A page which holds TextItems grouped by block displayable via PdfPageBlockView -export default class PdfBlockPage { - - constructor(options) { - this.index = options.index; - this.blocks = options.blocks; - } - -} diff --git a/src/javascript/models/PdfPage.jsx b/src/javascript/models/PdfPage.jsx deleted file mode 100644 index 36a163b..0000000 --- a/src/javascript/models/PdfPage.jsx +++ /dev/null @@ -1,9 +0,0 @@ -// A page which holds TextItems displayable via PdfPageView -export default class PdfPage { - - constructor(options) { - this.index = options.index; - this.textItems = [] - } - -} diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx index 2fa1a33..fb1ab2c 100644 --- a/src/javascript/models/TextItem.jsx +++ b/src/javascript/models/TextItem.jsx @@ -1,7 +1,10 @@ +import PageItem from './PageItem.jsx' + //A text item, i.e. a line or a word within a page -export default class TextItem { +export default class TextItem extends PageItem { constructor(options) { + super(options); this.x = options.x; this.y = options.y; this.width = options.width; @@ -10,8 +13,6 @@ export default class TextItem { this.font = options.font; this.fontAscent = options.fontAscent; this.fontDescent = options.fontDescent; - this.annotation = options.annotation; - this.markdownElement = options.markdownElement; } } diff --git a/src/javascript/models/TextItemBlock.jsx b/src/javascript/models/TextItemBlock.jsx new file mode 100644 index 0000000..614c422 --- /dev/null +++ b/src/javascript/models/TextItemBlock.jsx @@ -0,0 +1,11 @@ +import PageItem from './PageItem.jsx' + +// A block of TextItem[] within a Page +export default class TextItemBlock extends PageItem { + + constructor(options) { + super(options); + this.textItems = options.textItems; + } + +} diff --git a/src/javascript/models/TextPage.jsx b/src/javascript/models/TextPage.jsx deleted file mode 100644 index 88f2806..0000000 --- a/src/javascript/models/TextPage.jsx +++ /dev/null @@ -1,9 +0,0 @@ -// A page which holds TextItems displayable via PdfPageView -export default class TextPage { - - constructor(options) { - this.index = options.index; - this.text = options.text; - } - -} diff --git a/src/javascript/models/markdown/Headline.jsx b/src/javascript/models/markdown/Headline.jsx deleted file mode 100644 index c633a23..0000000 --- a/src/javascript/models/markdown/Headline.jsx +++ /dev/null @@ -1,17 +0,0 @@ -import MarkdownElement from './MarkdownElement.jsx'; - -export default class Headline extends MarkdownElement { - - constructor(options) { - super({ - newLineBefore: true, - newLineAfter: true - }); - this.level = options.level; - } - - transformText(text) { - return '#'.repeat(this.level) + ' ' + text; - } - -} diff --git a/src/javascript/models/markdown/MarkdownElement.jsx b/src/javascript/models/markdown/MarkdownElement.jsx deleted file mode 100644 index c860e7d..0000000 --- a/src/javascript/models/markdown/MarkdownElement.jsx +++ /dev/null @@ -1,16 +0,0 @@ -// An text item detected as markdown element -export default class MarkdownElement { - - constructor(options) { - if (this.constructor === MarkdownElement) { - throw new TypeError("Can not construct abstract class."); - } - this.newLineBefore = options.newLineBefore; - this.newLineAfter = options.newLineAfter; - } - - transformText(text) { // eslint-disable-line no-unused-vars - throw new TypeError("Do not call abstract method foo from child."); - } - -} diff --git a/src/javascript/models/transformations/CalculateGlobalStats.jsx b/src/javascript/models/transformations/CalculateGlobalStats.jsx index 4d6c8a3..b34d080 100644 --- a/src/javascript/models/transformations/CalculateGlobalStats.jsx +++ b/src/javascript/models/transformations/CalculateGlobalStats.jsx @@ -1,7 +1,7 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; +import ToTextItemTransformation from './ToTextItemTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; -export default class CalculateGlobalStats extends ToPdfViewTransformation { +export default class CalculateGlobalStats extends ToTextItemTransformation { constructor() { super("Calculate Statistics"); @@ -14,8 +14,8 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation { const fontToOccurrence = {}; var maxHeight = 0; var maxHeightFont; - parseResult.content.forEach(page => { - page.textItems.forEach(item => { + parseResult.pages.forEach(page => { + page.items.forEach(item => { heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1; fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1; if (item.height > maxHeight) { @@ -29,9 +29,9 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation { // Parse line distances const distanceToOccurrence = {}; - parseResult.content.forEach(page => { + parseResult.pages.forEach(page => { var lastItemOfMostUsedHeight; - page.textItems.forEach(item => { + page.items.forEach(item => { if (item.height == mostUsedHeight && item.text.trim().length > 0) { if (lastItemOfMostUsedHeight && item.y != lastItemOfMostUsedHeight.y) { const distance = lastItemOfMostUsedHeight.y - item.y; @@ -49,10 +49,10 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation { //Make a copy of the originals so all following transformation don't modify them - const newContent = parseResult.content.map(pdfPage => { + const newPages = parseResult.pages.map(page => { return { - ...pdfPage, - textItems: pdfPage.textItems.map(textItem => { + ...page, + items: page.items.map(textItem => { return { ...textItem, } @@ -61,7 +61,7 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation { }); return new ParseResult({ ...parseResult, - content: newContent, + pages: newPages, globals: { mostUsedHeight: mostUsedHeight, mostUsedFont: mostUsedFont, diff --git a/src/javascript/models/transformations/CombineSameY.jsx b/src/javascript/models/transformations/CombineSameY.jsx index 402067f..eb3e2b8 100644 --- a/src/javascript/models/transformations/CombineSameY.jsx +++ b/src/javascript/models/transformations/CombineSameY.jsx @@ -1,4 +1,4 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; +import ToTextItemTransformation from './ToTextItemTransformation.jsx'; import TextItem from '../TextItem.jsx'; import ParseResult from '../ParseResult.jsx'; import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; @@ -40,7 +40,7 @@ function combineTextItems(textItems:TextItem[]) { }); } -export default class CombineSameY extends ToPdfViewTransformation { +export default class CombineSameY extends ToTextItemTransformation { constructor() { super("Combine Text On Same Y"); diff --git a/src/javascript/models/transformations/DetectCodeBlocks.jsx b/src/javascript/models/transformations/DetectCodeBlocks.jsx index ff8c4b2..486b220 100644 --- a/src/javascript/models/transformations/DetectCodeBlocks.jsx +++ b/src/javascript/models/transformations/DetectCodeBlocks.jsx @@ -1,13 +1,13 @@ -import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx'; +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; -import PdfBlock from '../PdfBlock.jsx'; +import TextItemBlock from '../TextItemBlock.jsx'; import TextItemCombiner from '../TextItemCombiner.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; import { CODE_BLOCK } from '../MarkdownElements.jsx'; import { minXFromBlocks } from '../../textItemFunctions.jsx'; //Detect quotes, code etc.. which is transformed to markdown code syntax -export default class DetectCodeBlocks extends ToPdfBlockViewTransformation { +export default class DetectCodeBlocks extends ToTextItemBlockTransformation { constructor() { super("Detect Code/Quotes"); @@ -21,8 +21,8 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation { mostUsedDistance: mostUsedDistance }); - parseResult.content.forEach(page => { - var minX = minXFromBlocks(page.blocks); + parseResult.pages.forEach(page => { + var minX = minXFromBlocks(page.items); if (minX) { const itemAreSuitable = (items) => { for ( let item of items ) { @@ -37,7 +37,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation { }; const newBlocks = []; var preceedingCodeBlock; - page.blocks.forEach(block => { + page.items.forEach(block => { if (block.type) { newBlocks.push(block); preceedingCodeBlock = null; @@ -54,7 +54,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation { preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems); preceedingCodeBlock.parsedElements.add(combineResult.parsedElements); } else { - preceedingCodeBlock = new PdfBlock({ + preceedingCodeBlock = new TextItemBlock({ type: CODE_BLOCK, annotation: ADDED_ANNOTATION, textItems: combineResult.textItems, @@ -69,7 +69,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation { } } }); - page.blocks = newBlocks; + page.items = newBlocks; } }); diff --git a/src/javascript/models/transformations/DetectFootnotes.jsx b/src/javascript/models/transformations/DetectFootnotes.jsx index 96282a8..6d9c98c 100644 --- a/src/javascript/models/transformations/DetectFootnotes.jsx +++ b/src/javascript/models/transformations/DetectFootnotes.jsx @@ -1,12 +1,12 @@ -import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx'; +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; -import PdfBlock from '../PdfBlock.jsx'; +import TextItemBlock from '../TextItemBlock.jsx'; import TextItemCombiner from '../TextItemCombiner.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx'; //Detect quotes, code etc.. which is transformed to markdown code syntax -export default class DetectFootnotes extends ToPdfBlockViewTransformation { +export default class DetectFootnotes extends ToTextItemBlockTransformation { constructor() { super("Detect Footnotes"); @@ -19,17 +19,17 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation { mostUsedDistance: mostUsedDistance, }); - parseResult.content.forEach(page => { + parseResult.pages.forEach(page => { const newBlocks = []; var lastFootnote; - page.blocks.forEach(block => { + page.items.forEach(block => { newBlocks.push(block); if (!block.type && block.textItems[0].y < 200) { const combineResult = textCombiner.combine(block.textItems); if (combineResult.parsedElements.footnotes.length > 0) { block.annotation = REMOVED_ANNOTATION; foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes); - lastFootnote = new PdfBlock({ + lastFootnote = new TextItemBlock({ textItems: combineResult.textItems, type: FOOTNOTE_BLOCK, annotation: ADDED_ANNOTATION, @@ -48,7 +48,7 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation { lastFootnote = null; } }); - page.blocks = newBlocks; + page.items = newBlocks; }); return new ParseResult({ diff --git a/src/javascript/models/transformations/DetectHeadlines.jsx b/src/javascript/models/transformations/DetectHeadlines.jsx new file mode 100644 index 0000000..44c847a --- /dev/null +++ b/src/javascript/models/transformations/DetectHeadlines.jsx @@ -0,0 +1,198 @@ +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; +import ParseResult from '../ParseResult.jsx'; +import TextItemBlock from '../TextItemBlock.jsx'; +import TextItemCombiner from '../TextItemCombiner.jsx'; +import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; +import { HEADLINE1, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx'; + +//Detect headlines +export default class DetectHeadlines extends ToTextItemBlockTransformation { + + constructor() { + super("Detect Headlines"); + } + + transform(parseResult:ParseResult) { + var foundHeadlines = 0; + const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals; + + const textCombiner = new TextItemCombiner({ + mostUsedDistance: mostUsedDistance, + }); + + //Set max headlines (all headers on the same page are max level 2) + const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner); + + var headlineHeightFlowBeforeToc = []; + var headlineHeightsOccurenceBeforeToc = {}; + var firstPageAfterToc = 0; + if (tocPages && tocPages.length > 0) { + [headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages); + firstPageAfterToc = tocPages[tocPages.length - 1] + 1; + } + + const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages); + + + // TODO ==> do flow analysis (remove out of flow or snap, start with 2nd) + // TODO ==> parse seperately between beforeToc and after + // TODO ==> Kala chakra, all uppercase + // TODO ==> TOC headlines + + //var topHeadlinePassed = false; + const headlineHeightMap = {}; + const headlineSizePerLevel = {}; + var currentHeadlineLevel; + parseResult.pages.forEach(page => { + const newBlocks = []; + page.items.forEach(block => { + newBlocks.push(block); + if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { + const combineResult = textCombiner.combine(block.textItems); + if (combineResult.textItems.length == 1) { + const height = combineResult.textItems[0].height; + if (height == maxHeight) { + block.annotation = REMOVED_ANNOTATION; + currentHeadlineLevel = 1; + headlineSizePerLevel[currentHeadlineLevel] = height + addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); + } + // else if (currentHeadlineLevel) { + // const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel]; + // if (height < currentLevelSize) { + // const nextLevelSize = headlineSizePerLevel[currentHeadlineLevel + 1]; + // // if(!nextLevelSize) + // if (currentHeadlineLevel < 6) { + // currentHeadlineLevel++; + // } + // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); + // headlineSizePerLevel[currentHeadlineLevel] = height; + // } else if (height > currentLevelSize) { + // const preLevelSize = headlineSizePerLevel[currentHeadlineLevel - 1]; + // if (currentHeadlineLevel > 1) { + // currentHeadlineLevel--; + // } + // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); + // headlineSizePerLevel[currentHeadlineLevel] = height; + // } else { + // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); + // } + // } + } + } + }); + page.items = newBlocks; + }); + + const heightToOccurrence = {}; + const fontToOccurrence = {}; + // parseResult.content.forEach(page => { + // const newBlocks = []; + // page.blocks.forEach(block => { + // newBlocks.push(block); + // if (!block.type && block.textItems[0].height > mostUsedHeight) { + // foundHeadlines++; + // block.annotation = REMOVED_ANNOTATION; + // const combineResult = textCombiner.combine(block.textItems); + // const height = combineResult.textItems[0].height; + // const font = combineResult.textItems[0].font; + // heightToOccurrence[height] = heightToOccurrence[height] ? heightToOccurrence[height] + 1 : 1; + // fontToOccurrence[font] = fontToOccurrence[font] ? fontToOccurrence[font] + 1 : 1; + // newBlocks.push(new PdfBlock({ + // textItems: combineResult.textItems, + // type: HEADLINE1, + // annotation: ADDED_ANNOTATION, + // parsedElements: combineResult.parsedElements + // })); + // } + // }); + // page.blocks = newBlocks; + // }); + + return new ParseResult({ + ...parseResult, + messages: [ + 'Found headlines: ' + foundHeadlines, + 'Height repetition: ' + JSON.stringify(heightToOccurrence), + 'Font repetition: ' + JSON.stringify(fontToOccurrence), + 'Pages with max Header: ' + maxHeaderPages, + 'Headline Height Flow (before TOC): ' + headlineHeightFlowBeforeToc, + 'Headline Heights Occurence (before TOC): ' + JSON.stringify(headlineHeightsOccurenceBeforeToc), + 'Headline Height Flow: ' + headlineHeightFlowAfterToc, + 'Headline Heights Occurence: ' + JSON.stringify(headlineHeightsOccurenceAfterToc), + ] + }); + } + +} + +function addNewBlock(newBlocks, combineResult, headlineLevel) { + newBlocks.push(new TextItemBlock({ + textItems: combineResult.textItems, + type: headlineLevel, + annotation: ADDED_ANNOTATION, + parsedElements: combineResult.parsedElements + })); +} + +function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) { + // Find pages with max height + const maxHeaderPagesSet = new Set(); + pages.forEach(page => { + page.items.forEach(block => { + if (!block.type && block.textItems[0].height == maxHeight) { + maxHeaderPagesSet.add(page); + } + }); + }); + + // Now convert those pages to headlines + const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); + maxHeaderPagesSet.forEach(pageWithMaxHeader => { + const newBlocks = []; + pageWithMaxHeader.items.forEach(block => { + newBlocks.push(block); + const height = block.textItems[0].height; + if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) { + block.annotation = REMOVED_ANNOTATION; + const combineResult = textCombiner.combine(block.textItems); + if (height == maxHeight) { + addNewBlock(newBlocks, combineResult, HEADLINE1); + } else if (combineResult.textItems.length == 1) { + addNewBlock(newBlocks, combineResult, HEADLINE2); + } + } + }); + pageWithMaxHeader.items = newBlocks; + }); + + return Array.from(maxHeaderPagesSet).map(page => page.index + 1); +} + +function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) { + const headlineHeightFlow = []; + const headlineHeightsOccurences = {}; + var lastHeadlineHeight; + for (var i = from; i < to; i++) { + const page = pages[i]; + if (!maxHeaderPages.includes(page.index + 1)) { + page.items.forEach(block => { + if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { + const combineResult = textCombiner.combine(block.textItems); + if (combineResult.textItems.length == 1) { + const height = combineResult.textItems[0].height; + headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ; + if (!lastHeadlineHeight || height != lastHeadlineHeight) { + headlineHeightFlow.push(height); + //headlineFontFlow.push(combineResult.textItems[0].font) + lastHeadlineHeight = height; + } + } + } + }); + } + } + + return [headlineHeightFlow, headlineHeightsOccurences]; +} + diff --git a/src/javascript/models/transformations/DetectLists.jsx b/src/javascript/models/transformations/DetectLists.jsx index e2bd6f6..1ac0459 100644 --- a/src/javascript/models/transformations/DetectLists.jsx +++ b/src/javascript/models/transformations/DetectLists.jsx @@ -1,14 +1,14 @@ -import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx'; +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; import TextItem from '../TextItem.jsx'; -import PdfBlock from '../PdfBlock.jsx'; +import TextItemBlock from '../TextItemBlock.jsx'; import TextItemCombiner from '../TextItemCombiner.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx'; import { minXFromBlocks } from '../../textItemFunctions.jsx'; //Detect quotes, code etc.. which is transformed to markdown code syntax -export default class DetectLists extends ToPdfBlockViewTransformation { +export default class DetectLists extends ToTextItemBlockTransformation { constructor() { super("Detect Lists"); @@ -21,11 +21,11 @@ export default class DetectLists extends ToPdfBlockViewTransformation { mostUsedDistance: mostUsedDistance }); - parseResult.content.forEach(page => { - var minX = minXFromBlocks(page.blocks); + parseResult.pages.forEach(page => { + var minX = minXFromBlocks(page.items); if (minX) { const newBlocks = []; - page.blocks.forEach(block => { + page.items.forEach(block => { newBlocks.push(block); if (!block.type) { const combineResult = textCombiner.combine(block.textItems); @@ -81,14 +81,14 @@ export default class DetectLists extends ToPdfBlockViewTransformation { }); if (itemsBeforeFirstLineItem.length > 0) { - newBlocks.push(new PdfBlock({ + newBlocks.push(new TextItemBlock({ textItems: itemsBeforeFirstLineItem, type: PARAGRAPH, annotation: ADDED_ANNOTATION })); } //TODO display with whitespace pre support - newBlocks.push(new PdfBlock({ + newBlocks.push(new TextItemBlock({ textItems: listBlockItems, type: LIST_BLOCK, annotation: ADDED_ANNOTATION, @@ -97,7 +97,7 @@ export default class DetectLists extends ToPdfBlockViewTransformation { } } }); - page.blocks = newBlocks; + page.items = newBlocks; } }); diff --git a/src/javascript/models/transformations/DetectPdfBlocks.jsx b/src/javascript/models/transformations/DetectPdfBlocks.jsx index 34e9d12..78d5fce 100644 --- a/src/javascript/models/transformations/DetectPdfBlocks.jsx +++ b/src/javascript/models/transformations/DetectPdfBlocks.jsx @@ -1,10 +1,10 @@ -import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx'; +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; +import Page from '../Page.jsx'; import ParseResult from '../ParseResult.jsx'; -import PdfBlockPage from '../PdfBlockPage.jsx'; -import PdfBlock from '../PdfBlock.jsx'; +import TextItemBlock from '../TextItemBlock.jsx'; import { minXFromTextItems } from '../../textItemFunctions.jsx'; -export default class DetectPdfBlocks extends ToPdfBlockViewTransformation { +export default class DetectPdfBlocks extends ToTextItemBlockTransformation { constructor() { super("Detect Blocks"); @@ -13,20 +13,20 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation { transform(parseResult:ParseResult) { const {mostUsedDistance} = parseResult.globals; var createdBlocks = 0; - const newContent = parseResult.content.map(page => { - var minX = minXFromTextItems(page.textItems); + const newPages = parseResult.pages.map(page => { + var minX = minXFromTextItems(page.items); const blocks = []; var textItemsInBlock = []; const completBlock = () => { if (textItemsInBlock.length > 0) { //can happen on empty page - blocks.push(new PdfBlock({ + blocks.push(new TextItemBlock({ textItems: textItemsInBlock })); textItemsInBlock = []; } }; var lastItem; - page.textItems.forEach(item => { + page.items.forEach(item => { if (lastItem) { if (shouldSplit(lastItem, item, minX, mostUsedDistance)) { @@ -39,16 +39,16 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation { completBlock(); createdBlocks += blocks.length; - return new PdfBlockPage({ + return new Page({ ...page, - blocks: blocks + items: blocks }); }); return new ParseResult({ ...parseResult, - content: newContent, + pages: newPages, messages: ['Splitted into ' + createdBlocks + ' blocks'] }); } diff --git a/src/javascript/models/transformations/DetectTOC.jsx b/src/javascript/models/transformations/DetectTOC.jsx index b13a6ea..b39f17a 100644 --- a/src/javascript/models/transformations/DetectTOC.jsx +++ b/src/javascript/models/transformations/DetectTOC.jsx @@ -1,7 +1,7 @@ -import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx'; +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; import TextItem from '../TextItem.jsx'; -import PdfBlock from '../PdfBlock.jsx'; +import TextItemBlock from '../TextItemBlock.jsx'; import TextItemCombiner from '../TextItemCombiner.jsx'; import HeadlineFinder from '../HeadlineFinder.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; @@ -9,16 +9,16 @@ import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx'; import { isDigit } from '../../functions.jsx' //Detect table of contents pages -export default class DetectTOC extends ToPdfBlockViewTransformation { +export default class DetectTOC extends ToTextItemBlockTransformation { constructor() { - super("Detect Table of Contents"); + super("Detect TOC"); } transform(parseResult:ParseResult) { const {mostUsedDistance} = parseResult.globals; const tocPages = []; - const maxPagesToEvaluate = Math.min(20, parseResult.content.length); + const maxPagesToEvaluate = Math.min(20, parseResult.pages.length); const textCombiner = new TextItemCombiner({ mostUsedDistance: mostUsedDistance }); @@ -26,14 +26,14 @@ export default class DetectTOC extends ToPdfBlockViewTransformation { const linkLeveler = new LinkLeveler(); var tocLinks = []; var lastTocPage; - parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => { + parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => { var linesCount = 0; var linesWithDigitsCount = 0; var lineItemsWithDigits = []; const unknownBlocks = new Set(); var headlineBlock; const pageTocLinks = []; - page.blocks.forEach(block => { + page.items.forEach(block => { var blockHasLinesWithDigits = false; const itemsGroupedByY = textCombiner.combine(block.textItems).textItems; var lastLineTextWithoutNumber; @@ -87,20 +87,20 @@ export default class DetectTOC extends ToPdfBlockViewTransformation { tocLinks = tocLinks.concat(pageTocLinks); const newBlocks = []; - page.blocks.forEach((block) => { + page.items.forEach((block) => { if (!unknownBlocks.has(block)) { block.annotation = REMOVED_ANNOTATION; } newBlocks.push(block); if (block === headlineBlock) { - newBlocks.push(new PdfBlock({ + newBlocks.push(new TextItemBlock({ textItems: textCombiner.combine(block.textItems).textItems, type: HEADLINE2, annotation: ADDED_ANNOTATION })); } }); - page.blocks = newBlocks; + page.items = newBlocks; } }); @@ -109,12 +109,12 @@ export default class DetectTOC extends ToPdfBlockViewTransformation { const notFoundHeadlines = []; if (tocPages.length > 0) { tocLinks.forEach(tocLink => { - var linkedPage = parseResult.content[tocLink.pageNumber - 1]; + var linkedPage = parseResult.pages[tocLink.pageNumber - 1]; var foundHeadline = false; if (linkedPage) { foundHeadline = findHeadline(linkedPage, tocLink, textCombiner); if (!foundHeadline) { // pages are off by 1 ? - linkedPage = parseResult.content[tocLink.pageNumber]; + linkedPage = parseResult.pages[tocLink.pageNumber]; if (linkedPage) { foundHeadline = findHeadline(linkedPage, tocLink, textCombiner); } @@ -126,7 +126,7 @@ export default class DetectTOC extends ToPdfBlockViewTransformation { notFoundHeadlines.push(tocLink); } }); - lastTocPage.blocks.push(new PdfBlock({ + lastTocPage.items.push(new TextItemBlock({ textItems: tocLinks.map(tocLink => { tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text; return tocLink.textItem @@ -164,7 +164,7 @@ function findHeadline(page, tocLink, textCombiner) { }); var blockIndex = 0; var lastBlock; - for ( var block of page.blocks ) { + for ( var block of page.items ) { const itemsGroupedByY = textCombiner.combine(block.textItems).textItems; for ( var item of itemsGroupedByY ) { const headlineItems = headlineFinder.consume(item); @@ -175,7 +175,7 @@ function findHeadline(page, tocLink, textCombiner) { // 2 line headline lastBlock.annotation = REMOVED_ANNOTATION; } - page.blocks.splice(blockIndex + 1, 0, new PdfBlock({ + page.items.splice(blockIndex + 1, 0, new TextItemBlock({ textItems: [new TextItem({ ...usedItems[0], text: headline diff --git a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx index bcb258c..b5e3a0c 100644 --- a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx +++ b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx @@ -1,4 +1,4 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; +import ToTextItemTransformation from './ToTextItemTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; import { REMOVED_ANNOTATION } from '../Annotation.jsx'; @@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) { // Remove elements with similar content on same page positions, like page numbers, licenes information, etc... -export default class RemoveRepetitiveElements extends ToPdfViewTransformation { +export default class RemoveRepetitiveElements extends ToTextItemTransformation { constructor() { super("Remove Repetitive Elements"); @@ -36,8 +36,8 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation { const pageStore = []; const minLineHashRepetitions = {}; const maxLineHashRepetitions = {}; - parseResult.content.forEach(pdfPage => { - const minMaxItems = pdfPage.textItems.reduce((itemStore, item) => { + parseResult.pages.forEach(page => { + const minMaxItems = page.items.reduce((itemStore, item) => { if (item.y < itemStore.minY) { itemStore.minElements = [item]; itemStore.minY = item.y; @@ -73,14 +73,14 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation { // now annoate all removed items var removedHeader = 0; var removedFooter = 0; - parseResult.content.forEach((pdfPage, i) => { - if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) { + parseResult.pages.forEach((page, i) => { + if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) { pageStore[i].minElements.forEach(item => { item.annotation = REMOVED_ANNOTATION; }); removedFooter++; } - if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) { + if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) { pageStore[i].maxElements.forEach(item => { item.annotation = REMOVED_ANNOTATION; }); diff --git a/src/javascript/models/transformations/ToMarkdown.jsx b/src/javascript/models/transformations/ToMarkdown.jsx index 7b1cd17..2d9415b 100644 --- a/src/javascript/models/transformations/ToMarkdown.jsx +++ b/src/javascript/models/transformations/ToMarkdown.jsx @@ -2,7 +2,6 @@ import React from 'react'; import MarkdownPageView from '../../components/debug/MarkdownPageView.jsx'; import Transformation from './Transformation.jsx'; import ParseResult from '../ParseResult.jsx'; -import TextPage from '../TextPage.jsx'; export default class ToMarkdown extends Transformation { @@ -15,18 +14,15 @@ export default class ToMarkdown extends Transformation { } transform(parseResult:ParseResult) { - var text = ''; - parseResult.content.forEach(page => { - page.blocks.forEach((block) => { + parseResult.pages.forEach(page => { + var text = ''; + page.items.forEach(block => { text += block.text + '\n'; }); + page.items = [text]; }); return new ParseResult({ ...parseResult, - content: [new TextPage({ - index: 0, - text: text - })], }); } diff --git a/src/javascript/models/transformations/ToPdfViewTransformation.jsx b/src/javascript/models/transformations/ToPdfViewTransformation.jsx deleted file mode 100644 index 2bbd699..0000000 --- a/src/javascript/models/transformations/ToPdfViewTransformation.jsx +++ /dev/null @@ -1,45 +0,0 @@ -import React from 'react'; -import Transformation from './Transformation.jsx'; -import ParseResult from '../ParseResult.jsx'; -import PdfPageView from '../../components/debug/PdfPageView.jsx'; -import { REMOVED_ANNOTATION } from '../Annotation.jsx'; - -// Abstract class for transformations producing a PdfPage to be shown in the PdfView -export default class ToPdfViewTransformation extends Transformation { - - constructor(name) { - super(name); - if (this.constructor === ToPdfViewTransformation) { - throw new TypeError("Can not construct abstract class."); - } - this.showWhitespaces = false; - } - - showPageSelection() { - return true; - } - - showModificationCheckbox() { - return true; - } - - createPageView(page, modificationsOnly) { - return ; - } - - completeTransform(parseResult:ParseResult) { - // The usual cleanup - parseResult.messages = []; - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(block => block.annotation = null); - }); - return parseResult; - } - - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/ToTextBlocks.jsx b/src/javascript/models/transformations/ToTextBlocks.jsx index 315deaf..10d8b0f 100644 --- a/src/javascript/models/transformations/ToTextBlocks.jsx +++ b/src/javascript/models/transformations/ToTextBlocks.jsx @@ -1,8 +1,7 @@ import React from 'react'; import Transformation from './Transformation.jsx'; -import BlockPageView from '../../components/debug/BlockPageView.jsx'; +import TextPageView from '../../components/debug/TextPageView.jsx'; import ParseResult from '../ParseResult.jsx'; -import BlockPage from '../BlockPage.jsx'; import { blockToText } from '../MarkdownElements.jsx'; export default class ToTextBlocks extends Transformation { @@ -12,27 +11,23 @@ export default class ToTextBlocks extends Transformation { } createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars - return ; + return ; } transform(parseResult:ParseResult) { - const blocks = []; - parseResult.content.forEach(page => { - page.blocks.forEach(block => { + parseResult.pages.forEach(page => { + const textItems = []; + page.items.forEach(block => { const category = block.type ? block.type : 'Unknown'; - blocks.push({ + textItems.push({ category: category, text: blockToText(block) }); }); - + page.items = textItems; }); return new ParseResult({ ...parseResult, - content: [new BlockPage({ - index: 0, - blocks: blocks - })], }); } diff --git a/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx b/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx new file mode 100644 index 0000000..2687615 --- /dev/null +++ b/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx @@ -0,0 +1,44 @@ +import React from 'react'; +import Transformation from './Transformation.jsx'; +import ParseResult from '../ParseResult.jsx'; +import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx'; +import { REMOVED_ANNOTATION } from '../Annotation.jsx'; + +// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView +export default class ToTextItemBlockTransformation extends Transformation { + + constructor(name) { + super(name); + if (this.constructor === ToTextItemBlockTransformation) { + throw new TypeError("Can not construct abstract class."); + } + this.showWhitespaces = false; + } + + showPageSelection() { + return true; + } + + showModificationCheckbox() { + return true; + } + + createPageView(page, modificationsOnly) { + return ; + } + + completeTransform(parseResult:ParseResult) { + // The usual cleanup + parseResult.messages = []; + parseResult.pages.forEach(page => { + page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION); + page.items.forEach(item => item.annotation = null); + }); + return parseResult; + } + +} \ No newline at end of file diff --git a/src/javascript/models/transformations/ToPdfBlockViewTransformation.jsx b/src/javascript/models/transformations/ToTextItemTransformation.jsx similarity index 59% rename from src/javascript/models/transformations/ToPdfBlockViewTransformation.jsx rename to src/javascript/models/transformations/ToTextItemTransformation.jsx index 54db116..3514a56 100644 --- a/src/javascript/models/transformations/ToPdfBlockViewTransformation.jsx +++ b/src/javascript/models/transformations/ToTextItemTransformation.jsx @@ -1,15 +1,15 @@ import React from 'react'; import Transformation from './Transformation.jsx'; import ParseResult from '../ParseResult.jsx'; -import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx'; +import TextItemPageView from '../../components/debug/TextItemPageView.jsx'; import { REMOVED_ANNOTATION } from '../Annotation.jsx'; -// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView -export default class ToPdfBlockViewTransformation extends Transformation { +// Abstract class for transformations producing TextItem(s) to be shown in the TextItemPageView +export default class ToTextItemTransformation extends Transformation { constructor(name) { super(name); - if (this.constructor === ToPdfBlockViewTransformation) { + if (this.constructor === ToTextItemTransformation) { throw new TypeError("Can not construct abstract class."); } this.showWhitespaces = false; @@ -24,9 +24,9 @@ export default class ToPdfBlockViewTransformation extends Transformation { } createPageView(page, modificationsOnly) { - return ; } @@ -34,11 +34,12 @@ export default class ToPdfBlockViewTransformation extends Transformation { completeTransform(parseResult:ParseResult) { // The usual cleanup parseResult.messages = []; - parseResult.content.forEach(page => { - page.blocks = page.blocks.filter(block => !block.annotation || block.annotation !== REMOVED_ANNOTATION); - page.blocks.forEach(block => block.annotation = null); + parseResult.pages.forEach(page => { + page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION); + page.items.forEach(item => item.annotation = null); }); return parseResult; } + } \ No newline at end of file diff --git a/src/javascript/models/transformations/Transformation.jsx b/src/javascript/models/transformations/Transformation.jsx index 58f2fb8..b5ff64b 100644 --- a/src/javascript/models/transformations/Transformation.jsx +++ b/src/javascript/models/transformations/Transformation.jsx @@ -14,7 +14,7 @@ export default class Transformation { } showPageSelection() { - return false; + return true; } showModificationCheckbox() { diff --git a/src/javascript/models/transformations/VerticalToHorizontal.jsx b/src/javascript/models/transformations/VerticalToHorizontal.jsx index b1435db..96fed1f 100644 --- a/src/javascript/models/transformations/VerticalToHorizontal.jsx +++ b/src/javascript/models/transformations/VerticalToHorizontal.jsx @@ -1,10 +1,10 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; +import ToTextItemTransformation from './ToTextItemTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; import TextItem from '../TextItem.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; // Converts vertical text to horizontal -export default class VerticalToHorizontal extends ToPdfViewTransformation { +export default class VerticalToHorizontal extends ToTextItemTransformation { constructor() { super("Vertical to Horizontal Text"); @@ -12,7 +12,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation { transform(parseResult:ParseResult) { var foundVerticals = 0; - const newContent = parseResult.content.map(page => { + const newPages = parseResult.pages.map(page => { const newTextItems = []; // var oneCharacterItems = []; @@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation { //TODO generic state machine code ? - const leftOver = page.textItems.reduce((oneCharacterItems, item) => { + const leftOver = page.items.reduce((oneCharacterItems, item) => { if (item.text.trim().length == 1) { if (oneCharacterItems.length == 0) { oneCharacterItems.push(item); @@ -84,12 +84,12 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation { return { ...page, - textItems: newTextItems + items: newTextItems }; }); return new ParseResult({ ...parseResult, - content: newContent, + pages: newPages, messages: ["Converted " + foundVerticals + " verticals"] }); } diff --git a/src/javascript/textItemFunctions.jsx b/src/javascript/textItemFunctions.jsx index a60eeca..8c33ffc 100644 --- a/src/javascript/textItemFunctions.jsx +++ b/src/javascript/textItemFunctions.jsx @@ -1,7 +1,7 @@ -import PdfBlock from './models/PdfBlock.jsx'; +import TextItemBlock from './models/TextItemBlock.jsx'; import TextItem from './models/TextItem.jsx'; -export function minXFromBlocks(blocks:PdfBlock[]) { +export function minXFromBlocks(blocks:TextItemBlock[]) { var minX = 999; blocks.forEach(block => { block.textItems.forEach(item => {