parseHtml: add link and img extraction

This commit is contained in:
Hans Fast 2026-03-11 09:06:01 +01:00
parent e738cfae89
commit 18948cc6df
5 changed files with 298 additions and 3 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
node_modules

262
package-lock.json generated
View File

@ -1,11 +1,17 @@
{ {
"name": "ad_pubfiles_pub.d", "name": "ad_pubfiles_pub.d",
"version": "1.0.0",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "ad_pubfiles_pub.d",
"version": "1.0.0",
"license": "ISC",
"dependencies": { "dependencies": {
"@weborigami/origami": "^0.6.12" "@weborigami/origami": "^0.6.12",
"hast-util-from-html": "^2.0.3",
"unist-util-visit": "^5.1.0"
} }
}, },
"node_modules/@acemir/cssom": { "node_modules/@acemir/cssom": {
@ -685,6 +691,21 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
} }
}, },
"node_modules/@types/hast": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/@types/hast/-/hast-3.0.4.tgz",
"integrity": "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==",
"license": "MIT",
"dependencies": {
"@types/unist": "*"
}
},
"node_modules/@types/unist": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz",
"integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==",
"license": "MIT"
},
"node_modules/@weborigami/async-tree": { "node_modules/@weborigami/async-tree": {
"version": "0.6.12", "version": "0.6.12",
"resolved": "https://registry.npmjs.org/@weborigami/async-tree/-/async-tree-0.6.12.tgz", "resolved": "https://registry.npmjs.org/@weborigami/async-tree/-/async-tree-0.6.12.tgz",
@ -747,6 +768,16 @@
"require-from-string": "^2.0.2" "require-from-string": "^2.0.2"
} }
}, },
"node_modules/comma-separated-tokens": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz",
"integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/css-tree": { "node_modules/css-tree": {
"version": "3.1.0", "version": "3.1.0",
"resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.1.0.tgz", "resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.1.0.tgz",
@ -811,6 +842,15 @@
"integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==", "integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/dequal": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz",
"integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/detect-libc": { "node_modules/detect-libc": {
"version": "2.1.2", "version": "2.1.2",
"resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz",
@ -826,6 +866,19 @@
"integrity": "sha512-ZVyjhAJ7sCe1PNXEGveObOH9AC8QvMga3HJIghHawtG7mE4K5pW9nz/vDGAr/U7a3LWgdOzEE7ac9MURnyfaTA==", "integrity": "sha512-ZVyjhAJ7sCe1PNXEGveObOH9AC8QvMga3HJIghHawtG7mE4K5pW9nz/vDGAr/U7a3LWgdOzEE7ac9MURnyfaTA==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/devlop": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz",
"integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==",
"license": "MIT",
"dependencies": {
"dequal": "^2.0.0"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/entities": { "node_modules/entities": {
"version": "6.0.1", "version": "6.0.1",
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
@ -849,6 +902,86 @@
"integrity": "sha512-IaOQ9puYtjrkq7Y0Ygl9KDZnrf/aiUJYUpVf89y8kyaxbRG7Y1SrX/jaumrv81vc61+kiMempujsM3Yw7w5qcw==", "integrity": "sha512-IaOQ9puYtjrkq7Y0Ygl9KDZnrf/aiUJYUpVf89y8kyaxbRG7Y1SrX/jaumrv81vc61+kiMempujsM3Yw7w5qcw==",
"license": "ISC" "license": "ISC"
}, },
"node_modules/hast-util-from-html": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/hast-util-from-html/-/hast-util-from-html-2.0.3.tgz",
"integrity": "sha512-CUSRHXyKjzHov8yKsQjGOElXy/3EKpyX56ELnkHH34vDVw1N1XSQ1ZcAvTyAPtGqLTuKP/uxM+aLkSPqF/EtMw==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"devlop": "^1.1.0",
"hast-util-from-parse5": "^8.0.0",
"parse5": "^7.0.0",
"vfile": "^6.0.0",
"vfile-message": "^4.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-from-html/node_modules/parse5": {
"version": "7.3.0",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz",
"integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==",
"license": "MIT",
"dependencies": {
"entities": "^6.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/hast-util-from-parse5": {
"version": "8.0.3",
"resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.3.tgz",
"integrity": "sha512-3kxEVkEKt0zvcZ3hCRYI8rqrgwtlIOFMWkbclACvjlDw8Li9S2hk/d51OI0nr/gIpdMHNepwgOKqZ/sy0Clpyg==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"@types/unist": "^3.0.0",
"devlop": "^1.0.0",
"hastscript": "^9.0.0",
"property-information": "^7.0.0",
"vfile": "^6.0.0",
"vfile-location": "^5.0.0",
"web-namespaces": "^2.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-parse-selector": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz",
"integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hastscript": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz",
"integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"comma-separated-tokens": "^2.0.0",
"hast-util-parse-selector": "^4.0.0",
"property-information": "^7.0.0",
"space-separated-tokens": "^2.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/highlight.js": { "node_modules/highlight.js": {
"version": "11.11.1", "version": "11.11.1",
"resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz", "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz",
@ -1035,6 +1168,16 @@
"integrity": "sha512-B+b+kQ1YrYS7zO7P7bQcoqqMUizP06BOyNSBEnB5VJKDSWo8fsVuDkfSmwdjF0JsRtaNh83so5MMFJ95soH5jg==", "integrity": "sha512-B+b+kQ1YrYS7zO7P7bQcoqqMUizP06BOyNSBEnB5VJKDSWo8fsVuDkfSmwdjF0JsRtaNh83so5MMFJ95soH5jg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/property-information": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz",
"integrity": "sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/punycode": { "node_modules/punycode": {
"version": "2.3.1", "version": "2.3.1",
"resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
@ -1140,6 +1283,16 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/space-separated-tokens": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz",
"integrity": "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/stubborn-fs": { "node_modules/stubborn-fs": {
"version": "1.2.5", "version": "1.2.5",
"resolved": "https://registry.npmjs.org/stubborn-fs/-/stubborn-fs-1.2.5.tgz", "resolved": "https://registry.npmjs.org/stubborn-fs/-/stubborn-fs-1.2.5.tgz",
@ -1218,6 +1371,103 @@
"node": ">=20.18.1" "node": ">=20.18.1"
} }
}, },
"node_modules/unist-util-is": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.1.tgz",
"integrity": "sha512-LsiILbtBETkDz8I9p1dQ0uyRUWuaQzd/cuEeS1hoRSyW5E5XGmTzlwY1OrNzzakGowI9Dr/I8HVaw4hTtnxy8g==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/unist-util-stringify-position": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz",
"integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/unist-util-visit": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-5.1.0.tgz",
"integrity": "sha512-m+vIdyeCOpdr/QeQCu2EzxX/ohgS8KbnPDgFni4dQsfSCtpz8UqDyY5GjRru8PDKuYn7Fq19j1CQ+nJSsGKOzg==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"unist-util-is": "^6.0.0",
"unist-util-visit-parents": "^6.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/unist-util-visit-parents": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-6.0.2.tgz",
"integrity": "sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"unist-util-is": "^6.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/vfile": {
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/vfile/-/vfile-6.0.3.tgz",
"integrity": "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"vfile-message": "^4.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/vfile-location": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-5.0.3.tgz",
"integrity": "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"vfile": "^6.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/vfile-message": {
"version": "4.0.3",
"resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz",
"integrity": "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"unist-util-stringify-position": "^4.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/w3c-xmlserializer": { "node_modules/w3c-xmlserializer": {
"version": "5.0.0", "version": "5.0.0",
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz",
@ -1240,6 +1490,16 @@
"tiny-readdir": "^2.7.2" "tiny-readdir": "^2.7.2"
} }
}, },
"node_modules/web-namespaces": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz",
"integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/webidl-conversions": { "node_modules/webidl-conversions": {
"version": "8.0.1", "version": "8.0.1",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz",

View File

@ -1,6 +1,18 @@
{ {
"dependencies": { "dependencies": {
"@weborigami/origami": "^0.6.12" "@weborigami/origami": "^0.6.12",
"hast-util-from-html": "^2.0.3",
"unist-util-visit": "^5.1.0"
}, },
"type": "module" "type": "module",
"name": "ad_pubfiles_pub.d",
"version": "1.0.0",
"description": "",
"main": "addFilenameData.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC"
} }

14
parseHtml.js Normal file
View File

@ -0,0 +1,14 @@
import {fromHtml} from 'hast-util-from-html';
import {visit} from 'unist-util-visit';
export default async function(value) {
const tree = fromHtml(value);
const links = [];
visit(tree, [{tagName: 'link'}, {tagName: 'img'}], function(node) {
links.push(node);
})
const hrefs = links.map(link => link.properties.href || link.properties.src)
return hrefs;
// return tree;
}

View File

@ -51,6 +51,14 @@
}.final }.final
renderedPages: Tree.map(pages, (page) => <page.ori>(page, all)) renderedPages: Tree.map(pages, (page) => <page.ori>(page, all))
/*
the rendered pages contain links and images.
We want to find those, and insert a tree of their maps into the output.
First test is whether that works.
Second test is whether the paths will be correct.
*/
hast = Tree.map(renderedPages, parseHtml.js)
links: Tree.flat(hast, 2) → (a) => [...new Set(a)]
/* /*
assets are relative to the pubfiles directory assets are relative to the pubfiles directory