Detect Links

- Still needs a proper place since this is on `word` basis
This commit is contained in:
Johannes Zillmann 2024-04-15 08:20:18 -06:00
parent 3fa91a5d1e
commit b529dfa0a2
22 changed files with 753 additions and 0 deletions

View File

@ -0,0 +1,36 @@
{
"pages": 535,
"items": 53908,
"groupedItems": 31657,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 200,
"items": 8461,
"groupedItems": 8321,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 76,
"items": 3071,
"groupedItems": 2575,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 24,
"items": 1386,
"groupedItems": 1198,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 7,
"items": 483,
"groupedItems": 217,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 19,
"items": 1408,
"groupedItems": 1177,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 6,
"items": 268,
"groupedItems": 145,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 4,
"items": 134,
"groupedItems": 108,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 116,
"items": 7676,
"groupedItems": 3479,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 140,
"items": 25314,
"groupedItems": 3179,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 153,
"items": 14949,
"groupedItems": 10600,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 60,
"items": 3990,
"groupedItems": 1439,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 27,
"items": 1874,
"groupedItems": 1522,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 466,
"items": 772193,
"groupedItems": 15227,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 16,
"items": 1242,
"groupedItems": 416,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 4,
"items": 522,
"groupedItems": 378,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 293,
"items": 9255,
"groupedItems": 6520,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 7,
"items": 6779,
"groupedItems": 1096,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 648,
"items": 27824,
"groupedItems": 21530,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -0,0 +1,36 @@
{
"pages": 256,
"items": 20146,
"groupedItems": 7279,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "token types"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "width"
},
{
"name": "height"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
}
],
"globals": {}
}

View File

@ -20,6 +20,7 @@ import DetectBlocks from './transformer/DetectBlocks';
import DetectListLevels from './transformer/DetectListLevels';
import DetectFootnotes from './transformer/DetectFootnotes';
import DetectFontStyles from './transformer/DetectFontStyles';
import DetectLinks from './transformer/DetectLinks';
export const transformers = [
new AdjustHeight(),
@ -31,6 +32,7 @@ export const transformers = [
new RemoveRepetitiveItems(),
new DetectFootnotes(),
new DetectFontStyles(),
new DetectLinks(),
new DetectToc(),
new DetectHeaders(),
new DetectListItems(),

View File

@ -0,0 +1,31 @@
import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger';
/**
 * Item transformer that tags items whose text looks like a web link.
 *
 * An item is tagged with the `LINK` token type when its `str` value starts
 * with one of {@link DetectLinks.LINK_PREFIXES}. All other items are passed
 * through unchanged.
 *
 * Known limitation: only the beginning of an item's text is inspected, so a
 * link embedded in the middle of a longer item is not detected.
 */
export default class DetectLinks extends ItemTransformer {
  /** Text prefixes that identify an item as a link. */
  private static readonly LINK_PREFIXES: readonly string[] = ['http:', 'https:', 'www.'];

  constructor() {
    super('Detect Links', 'Detect occurrences http links', {
      // The transformer reads `item.data['str']`, so that column must be present.
      requireColumns: ['str'],
      debug: {
        itemMerger: new LineItemMerger(false),
      },
    });
  }

  /**
   * Tags every link-like item in `inputItems` with the `LINK` token type.
   *
   * @param context - Transformation context; not used by this transformer.
   * @param inputItems - Items to inspect; the array itself is not modified.
   * @returns The mapped items (in input order) plus a message stating how
   *          many items were tagged as links.
   */
  transform(context: TransformContext, inputItems: Item[]): ItemResult {
    let linkCount = 0;
    // TODO this is missing links which are just part of an item
    const items = inputItems.map((item) => {
      const itemText = item.data['str'];
      // Guard against a missing / non-string value so `startsWith` cannot throw.
      const isLink =
        typeof itemText === 'string' && DetectLinks.LINK_PREFIXES.some((prefix) => itemText.startsWith(prefix));
      if (!isLink) {
        return item;
      }
      linkCount++;
      // wordString = `http://${wordString}`; TODO www version
      return item.withTokenTypes(['LINK']);
    });
    return {
      items,
      // Report the real number of tagged items instead of a placeholder.
      messages: [`Detected ${linkCount} links.`],
    };
  }
}