mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-27 18:23:36 +01:00
DetectTOC: only go for lines containing '...' words
DetectTOC currently works on all lines: it strips trailing numbers from the end of each line, ignoring any word that consists only of full stops. That filtering implies that a TOC line is one that contains words made up solely of full stops, so DetectTOC should only operate on such lines. This change removes the unwanted behaviour where DetectTOC strips trailing numbers that we actually want to keep in a line, e.g. "Case Number : ABC 12/1234".
This commit is contained in:
parent
2869b5e5de
commit
648c0add59
@ -30,8 +30,10 @@ export default class DetectTOC extends ToLineItemTransformation {
|
||||
const pageTocLinks = [];
|
||||
var lastWordsWithoutNumber;
|
||||
var lastLine;
|
||||
//find lines ending with a number per page
|
||||
page.items.forEach(line => {
|
||||
// find lines with words containing only "." ...
|
||||
// Array#includes compares elements by value and never calls a predicate, so use some() to test each word.
const tocLines = page.items.filter(line => line.words.some(word => hasOnly(word.string, '.')));
|
||||
// ... and ending with a number per page
|
||||
tocLines.forEach(line => {
|
||||
var words = line.words.filter(word => !hasOnly(word.string, '.'));
|
||||
const digits = [];
|
||||
while (words.length > 0 && isNumber(words[words.length - 1].string)) {
|
||||
|
Loading…
Reference in New Issue
Block a user