mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-21 04:58:36 +01:00
[WIP] remove explicit Footnotes transformation
This commit is contained in:
parent
68e3fd7a9f
commit
15c5946073
@ -7,7 +7,6 @@ import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
|||||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||||
|
|
||||||
import GatherBlocks from './transformations/GatherBlocks.jsx'
|
import GatherBlocks from './transformations/GatherBlocks.jsx'
|
||||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
|
||||||
import DetectLists from './transformations/DetectLists.jsx'
|
import DetectLists from './transformations/DetectLists.jsx'
|
||||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||||
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
import DetectHeadlines from './transformations/DetectHeadlines.jsx'
|
||||||
@ -37,7 +36,6 @@ export default class AppState {
|
|||||||
new DetectTOC(),
|
new DetectTOC(),
|
||||||
|
|
||||||
new GatherBlocks(),
|
new GatherBlocks(),
|
||||||
new DetectFootnotes(),
|
|
||||||
new DetectLists(),
|
new DetectLists(),
|
||||||
new DetectCodeBlocks(),
|
new DetectCodeBlocks(),
|
||||||
new DetectHeadlines(),
|
new DetectHeadlines(),
|
||||||
|
@ -1,67 +0,0 @@
|
|||||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
|
||||||
import ParseResult from '../ParseResult.jsx';
|
|
||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
|
||||||
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
|
|
||||||
|
|
||||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
|
||||||
export default class DetectFootnotes extends ToTextItemBlockTransformation {
|
|
||||||
|
|
||||||
constructor() {
|
|
||||||
super("Detect Footnotes");
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
|
||||||
const {mostUsedDistance} = parseResult.globals;
|
|
||||||
var foundFootnotes = [];
|
|
||||||
const textCombiner = new TextItemCombiner({
|
|
||||||
mostUsedDistance: mostUsedDistance,
|
|
||||||
});
|
|
||||||
|
|
||||||
parseResult.pages.forEach(page => {
|
|
||||||
const newBlocks = [];
|
|
||||||
var lastFootnote;
|
|
||||||
page.items.forEach(block => {
|
|
||||||
newBlocks.push(block);
|
|
||||||
if (!block.type && block.textItems[0].y < 200) {
|
|
||||||
const combineResult = textCombiner.combine(block.textItems);
|
|
||||||
if (combineResult.parsedElements.footnotes.length > 0) {
|
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
|
||||||
foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
|
|
||||||
lastFootnote = new TextItemBlock({
|
|
||||||
textItems: combineResult.textItems,
|
|
||||||
type: FOOTNOTE_BLOCK,
|
|
||||||
annotation: ADDED_ANNOTATION,
|
|
||||||
parsedElements: combineResult.parsedElements
|
|
||||||
});
|
|
||||||
newBlocks.push(lastFootnote);
|
|
||||||
} else if (lastFootnote) {
|
|
||||||
// likely to be the second line of aboves footnote
|
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
|
||||||
lastFootnote.textItems = lastFootnote.textItems.concat(combineResult.textItems);
|
|
||||||
lastFootnote.parsedElements.add(combineResult.parsedElements);
|
|
||||||
newBlocks[newBlocks.length - 2] = block;
|
|
||||||
newBlocks[newBlocks.length - 1] = lastFootnote;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
lastFootnote = null;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
page.items = newBlocks;
|
|
||||||
});
|
|
||||||
|
|
||||||
return new ParseResult({
|
|
||||||
...parseResult,
|
|
||||||
messages: [
|
|
||||||
'Detected ' + foundFootnotes.length + ' footnotes:',
|
|
||||||
foundFootnotes.join(', ')
|
|
||||||
]
|
|
||||||
});
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user