mirror of
https://github.com/muerwre/vk-tg-bot.git
synced 2025-04-24 22:46:41 +07:00
fix link extraction
This commit is contained in:
parent
f28f291ac2
commit
6ac6ca9356
3 changed files with 59 additions and 12 deletions
|
@ -18,4 +18,29 @@ describe("extractURLs", () => {
|
|||
expect(result.length).toBe(1);
|
||||
expect(result[0].href).toBe("https://map.vault48.org/test");
|
||||
});
|
||||
|
||||
it("works with that weird new VK urls without scheme", () => {
|
||||
const result = extractURLs(
|
||||
"Trying out links: [#alias|map.vault48.org/test|map.vault48.org/test]"
|
||||
);
|
||||
|
||||
expect(result.length).toBe(1);
|
||||
expect(result[0].href).toBe("https://map.vault48.org/test");
|
||||
});
|
||||
|
||||
it("deduplicates matching urls", () => {
|
||||
const result = extractURLs(
|
||||
`Trying out links: [#alias|map.vault48.org/test|map.vault48.org/test] map.vault48.org/test https://map.vault48.org/test map.vault48.org/test2 https://map.vault48.org/test3
|
||||
[#alias|map.vault48.org/test2|map.vault48.org/test2] [#alias|map.vault48.org/test3|map.vault48.org/test3] [#alias|map.vault48.org/test4|map.vault48.org/test4] https://map.vault48.org/test5
|
||||
`
|
||||
).map((it) => it.href);
|
||||
|
||||
expect(result).toEqual([
|
||||
"https://map.vault48.org/test",
|
||||
"https://map.vault48.org/test2",
|
||||
"https://map.vault48.org/test3",
|
||||
"https://map.vault48.org/test4",
|
||||
"https://map.vault48.org/test5",
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
|
|
@ -2,21 +2,27 @@ import { transformMDLinks } from "../links";
|
|||
|
||||
describe("transformMDLinks", () => {
|
||||
it("extracts simple urls", () => {
|
||||
const result = transformMDLinks(
|
||||
"Trying out links https://map.vault48.org/test 123"
|
||||
);
|
||||
|
||||
expect(result).toBe(
|
||||
expect(
|
||||
transformMDLinks("Trying out links https://map.vault48.org/test 123")
|
||||
).toBe(
|
||||
"Trying out links [https://map.vault48…](https://map.vault48.org/test) 123"
|
||||
);
|
||||
});
|
||||
|
||||
it("works with that weird new VK urls", () => {
|
||||
const result = transformMDLinks(
|
||||
"Trying out links [#alias|12345678901234567890123|https://map.vault48.org/test_abc_def_ghi] 123"
|
||||
expect(
|
||||
transformMDLinks(
|
||||
"Trying out links [#alias|12345678901234567890123|https://map.vault48.org/test_abc_def_ghi] 123"
|
||||
)
|
||||
).toBe(
|
||||
"Trying out links [1234567890123456789…](https://map.vault48.org/test_abc_def_ghi) 123"
|
||||
);
|
||||
|
||||
expect(result).toBe(
|
||||
expect(
|
||||
transformMDLinks(
|
||||
"Trying out links [#alias|12345678901234567890123|map.vault48.org/test_abc_def_ghi] 123"
|
||||
)
|
||||
).toBe(
|
||||
"Trying out links [1234567890123456789…](https://map.vault48.org/test_abc_def_ghi) 123"
|
||||
);
|
||||
});
|
||||
|
|
|
@ -1,13 +1,26 @@
|
|||
import { URL } from "url";
|
||||
|
||||
const simpleUrlRegex = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s\]]{2,}|www\.[a-zA-Z0-9]+\.[^\s\]]{2,})/gim;
|
||||
const weirdLongUrlRegex = /\[(.*)\|(.*)\|(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s\]]{2,}|www\.[a-zA-Z0-9]+\.[^\s\]]{2,})\]/g;
|
||||
|
||||
/** Yep, that's how VK posts its links */
|
||||
const weirdLongUrlRegex = /\[\#alias\|([^\|]+)\|([^\]]+)\]/gim;
|
||||
|
||||
/**
 * Makes sure a link carries an explicit scheme.
 *
 * Returns the input untouched when it is empty or already begins with
 * `http://` / `https://` (compared case-insensitively); otherwise prefixes
 * it with `https://`.
 *
 * The check is anchored on the full `scheme://` prefix rather than the bare
 * letters "http", so scheme-less hosts that merely start with those letters
 * (e.g. `httpbin.org/get`) are still fixed up.
 *
 * @param url - Raw link text, with or without a scheme.
 * @returns The link with a scheme, or the original value if it is empty.
 */
const fixUrl = (url: string): string =>
  !url || /^https?:\/\//i.test(url) ? url : `https://${url}`;
|
||||
|
||||
/** Extracts URLs from text */
|
||||
export const extractURLs = (text: string): URL[] => {
|
||||
const matches = text.match(simpleUrlRegex) || [];
|
||||
const urls = new Set<string>();
|
||||
|
||||
return matches
|
||||
text
|
||||
.match(weirdLongUrlRegex)
|
||||
?.forEach((match) =>
|
||||
urls.add(fixUrl(match.replace(weirdLongUrlRegex, "$1")))
|
||||
);
|
||||
|
||||
text.match(simpleUrlRegex)?.forEach((match) => urls.add(match));
|
||||
|
||||
return Array.from(urls)
|
||||
.map((m) => {
|
||||
try {
|
||||
return new URL(m);
|
||||
|
@ -30,7 +43,10 @@ export const transformMDLinks = (value: string) =>
|
|||
return val;
|
||||
}
|
||||
|
||||
return `[${trimTo(args[1], 20)}](${args[2]})`;
|
||||
const title = trimTo(args[0] ?? args[1], 20);
|
||||
const url = fixUrl(args[1]);
|
||||
|
||||
return `[${title}](${url})`;
|
||||
})
|
||||
.replace(simpleUrlRegex, (val) => {
|
||||
if (val.endsWith(")")) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue