diff --git a/examples/Achieving-The-Paris-Climate-Agreement/detectLinks.json b/examples/Achieving-The-Paris-Climate-Agreement/detectLinks.json new file mode 100644 index 0000000..2b28038 --- /dev/null +++ b/examples/Achieving-The-Paris-Climate-Agreement/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 535, + "items": 53908, + "groupedItems": 31657, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Adventures-Of-Sherlock-Holmes/detectLinks.json b/examples/Adventures-Of-Sherlock-Holmes/detectLinks.json new file mode 100644 index 0000000..819a775 --- /dev/null +++ b/examples/Adventures-Of-Sherlock-Holmes/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 200, + "items": 8461, + "groupedItems": 8321, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Alice-In-Wonderland/detectLinks.json b/examples/Alice-In-Wonderland/detectLinks.json new file mode 100644 index 0000000..2a0694a --- /dev/null +++ b/examples/Alice-In-Wonderland/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 76, + "items": 3071, + "groupedItems": 2575, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/CC-NC_Leitfaden/detectLinks.json 
b/examples/CC-NC_Leitfaden/detectLinks.json new file mode 100644 index 0000000..5a5c154 --- /dev/null +++ b/examples/CC-NC_Leitfaden/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 24, + "items": 1386, + "groupedItems": 1198, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/CC_License_Agreement_of_siMPle/detectLinks.json b/examples/CC_License_Agreement_of_siMPle/detectLinks.json new file mode 100644 index 0000000..c62ab39 --- /dev/null +++ b/examples/CC_License_Agreement_of_siMPle/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 7, + "items": 483, + "groupedItems": 217, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Closed-Syllables/detectLinks.json b/examples/Closed-Syllables/detectLinks.json new file mode 100644 index 0000000..54ba4c8 --- /dev/null +++ b/examples/Closed-Syllables/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 19, + "items": 1408, + "groupedItems": 1177, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/ExamplePdf/detectLinks.json b/examples/ExamplePdf/detectLinks.json new file mode 100644 index 0000000..3626453 --- /dev/null +++ b/examples/ExamplePdf/detectLinks.json @@ -0,0 +1,36 
@@ +{ + "pages": 6, + "items": 268, + "groupedItems": 145, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Flash-Masques-Temperature/detectLinks.json b/examples/Flash-Masques-Temperature/detectLinks.json new file mode 100644 index 0000000..45c5ba1 --- /dev/null +++ b/examples/Flash-Masques-Temperature/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 4, + "items": 134, + "groupedItems": 108, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Grammar-Matters/detectLinks.json b/examples/Grammar-Matters/detectLinks.json new file mode 100644 index 0000000..5819256 --- /dev/null +++ b/examples/Grammar-Matters/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 116, + "items": 7676, + "groupedItems": 3479, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Life-Of-God-In-Soul-Of-Man/detectLinks.json b/examples/Life-Of-God-In-Soul-Of-Man/detectLinks.json new file mode 100644 index 0000000..04b69cc --- /dev/null +++ b/examples/Life-Of-God-In-Soul-Of-Man/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 140, + "items": 25314, + "groupedItems": 3179, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": 
"token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Made-with-cc/detectLinks.json b/examples/Made-with-cc/detectLinks.json new file mode 100644 index 0000000..ef95d9b --- /dev/null +++ b/examples/Made-with-cc/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 153, + "items": 14949, + "groupedItems": 10600, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Safe-Communication/detectLinks.json b/examples/Safe-Communication/detectLinks.json new file mode 100644 index 0000000..fe99fb1 --- /dev/null +++ b/examples/Safe-Communication/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 60, + "items": 3990, + "groupedItems": 1439, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/St-Mary-Witney-Social-Audit/detectLinks.json b/examples/St-Mary-Witney-Social-Audit/detectLinks.json new file mode 100644 index 0000000..5083913 --- /dev/null +++ b/examples/St-Mary-Witney-Social-Audit/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 27, + "items": 1874, + "groupedItems": 1522, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + 
"name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/The-Art-of-Public-Speaking/detectLinks.json b/examples/The-Art-of-Public-Speaking/detectLinks.json new file mode 100644 index 0000000..e0b58b4 --- /dev/null +++ b/examples/The-Art-of-Public-Speaking/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 466, + "items": 772193, + "groupedItems": 15227, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/The-Impact-of-Open-Access-Latin-American-Scholarship/detectLinks.json b/examples/The-Impact-of-Open-Access-Latin-American-Scholarship/detectLinks.json new file mode 100644 index 0000000..7dcb1b6 --- /dev/null +++ b/examples/The-Impact-of-Open-Access-Latin-American-Scholarship/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 16, + "items": 1242, + "groupedItems": 416, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/The-Man-Without-A-Body/detectLinks.json b/examples/The-Man-Without-A-Body/detectLinks.json new file mode 100644 index 0000000..d3444dc --- /dev/null +++ b/examples/The-Man-Without-A-Body/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 4, + "items": 522, + "groupedItems": 378, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { 
+ "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/The-War-of-the-Worlds/detectLinks.json b/examples/The-War-of-the-Worlds/detectLinks.json new file mode 100644 index 0000000..50fca3e --- /dev/null +++ b/examples/The-War-of-the-Worlds/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 293, + "items": 9255, + "groupedItems": 6520, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Tragedy-Of-The-Commons/detectLinks.json b/examples/Tragedy-Of-The-Commons/detectLinks.json new file mode 100644 index 0000000..0c83a15 --- /dev/null +++ b/examples/Tragedy-Of-The-Commons/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 7, + "items": 6779, + "groupedItems": 1096, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/Watered-Soul-Blog-Book/detectLinks.json b/examples/Watered-Soul-Blog-Book/detectLinks.json new file mode 100644 index 0000000..1c7b0c3 --- /dev/null +++ b/examples/Watered-Soul-Blog-Book/detectLinks.json @@ -0,0 +1,36 @@ +{ + "pages": 648, + "items": 27824, + "groupedItems": 21530, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "token types" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "width" + }, + { + "name": "height" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + } + ], + "globals": {} +} \ No newline at end of file diff --git a/examples/WoodUp/detectLinks.json 
b/examples/WoodUp/detectLinks.json
new file mode 100644
index 0000000..1c1f4b3
--- /dev/null
+++ b/examples/WoodUp/detectLinks.json
@@ -0,0 +1,36 @@
+{
+  "pages": 256,
+  "items": 20146,
+  "groupedItems": 7279,
+  "changes": 0,
+  "schema": [
+    {
+      "name": "line"
+    },
+    {
+      "name": "token types"
+    },
+    {
+      "name": "x"
+    },
+    {
+      "name": "y"
+    },
+    {
+      "name": "width"
+    },
+    {
+      "name": "height"
+    },
+    {
+      "name": "str"
+    },
+    {
+      "name": "fontName"
+    },
+    {
+      "name": "dir"
+    }
+  ],
+  "globals": {}
+}
\ No newline at end of file
diff --git a/src/index.ts b/src/index.ts
index bf50573..6c54c1c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -20,6 +20,7 @@ import DetectBlocks from './transformer/DetectBlocks';
 import DetectListLevels from './transformer/DetectListLevels';
 import DetectFootnotes from './transformer/DetectFootnotes';
 import DetectFontStyles from './transformer/DetectFontStyles';
+import DetectLinks from './transformer/DetectLinks';
 
 export const transformers = [
   new AdjustHeight(),
@@ -31,6 +32,7 @@ export const transformers = [
   new RemoveRepetitiveItems(),
   new DetectFootnotes(),
   new DetectFontStyles(),
+  new DetectLinks(),
   new DetectToc(),
   new DetectHeaders(),
   new DetectListItems(),
diff --git a/src/transformer/DetectLinks.ts b/src/transformer/DetectLinks.ts
new file mode 100644
index 0000000..2af5a3e
--- /dev/null
+++ b/src/transformer/DetectLinks.ts
@@ -0,0 +1,51 @@
+import Item from '../Item';
+import ItemResult from '../ItemResult';
+import ItemTransformer from './ItemTransformer';
+import TransformContext from './TransformContext';
+import LineItemMerger from '../debug/LineItemMerger';
+
+/** Text prefixes that mark an item as a link; 'https:' is listed explicitly because it does not start with 'http:'. */
+const LINK_PREFIXES = ['http:', 'https:', 'www.'];
+
+/**
+ * Tags items whose 'str' value starts with a link prefix with the 'LINK' token type.
+ *
+ * Only the beginning of each item's text is inspected; every other item is passed through unchanged.
+ */
+export default class DetectLinks extends ItemTransformer {
+  constructor() {
+    super('Detect Links', 'Detect occurrences http links', {
+      requireColumns: ['str'],
+      debug: {
+        itemMerger: new LineItemMerger(false),
+      },
+    });
+  }
+
+  /**
+   * @param context Transform context; not used by this transformer.
+   * @param inputItems Items to inspect.
+   * @returns One output item per input item (link items are the result of `item.withTokenTypes(['LINK'])`)
+   *   plus a message reporting how many items were tagged.
+   */
+  transform(context: TransformContext, inputItems: Item[]): ItemResult {
+    let linkCount = 0;
+    // TODO this is missing links which are just part of an item
+    const items = inputItems.map((item) => {
+      const itemText = item.data['str'];
+      // Guard against non-string values so a malformed item cannot crash the whole transform.
+      const isLink =
+        typeof itemText === 'string' && LINK_PREFIXES.some((prefix) => itemText.startsWith(prefix));
+      if (!isLink) {
+        return item;
+      }
+      // wordString = `http://${wordString}`; TODO www version
+      linkCount++;
+      return item.withTokenTypes(['LINK']);
+    });
+    return {
+      items,
+      messages: [`Detected ${linkCount} links.`],
+    };
+  }
+}