mirror of
https://github.com/muerwre/vk-tg-bot.git
synced 2025-04-24 22:46:41 +07:00
fix link extraction
This commit is contained in:
parent
f28f291ac2
commit
6ac6ca9356
3 changed files with 59 additions and 12 deletions
|
@ -18,4 +18,29 @@ describe("extractURLs", () => {
|
||||||
expect(result.length).toBe(1);
|
expect(result.length).toBe(1);
|
||||||
expect(result[0].href).toBe("https://map.vault48.org/test");
|
expect(result[0].href).toBe("https://map.vault48.org/test");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("works with that weird new VK urls without scheme", () => {
|
||||||
|
const result = extractURLs(
|
||||||
|
"Trying out links: [#alias|map.vault48.org/test|map.vault48.org/test]"
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.length).toBe(1);
|
||||||
|
expect(result[0].href).toBe("https://map.vault48.org/test");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("deduplicates matching urls", () => {
|
||||||
|
const result = extractURLs(
|
||||||
|
`Trying out links: [#alias|map.vault48.org/test|map.vault48.org/test] map.vault48.org/test https://map.vault48.org/test map.vault48.org/test2 https://map.vault48.org/test3
|
||||||
|
[#alias|map.vault48.org/test2|map.vault48.org/test2] [#alias|map.vault48.org/test3|map.vault48.org/test3] [#alias|map.vault48.org/test4|map.vault48.org/test4] https://map.vault48.org/test5
|
||||||
|
`
|
||||||
|
).map((it) => it.href);
|
||||||
|
|
||||||
|
expect(result).toEqual([
|
||||||
|
"https://map.vault48.org/test",
|
||||||
|
"https://map.vault48.org/test2",
|
||||||
|
"https://map.vault48.org/test3",
|
||||||
|
"https://map.vault48.org/test4",
|
||||||
|
"https://map.vault48.org/test5",
|
||||||
|
]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
|
@ -2,21 +2,27 @@ import { transformMDLinks } from "../links";
|
||||||
|
|
||||||
describe("transformMDLinks", () => {
|
describe("transformMDLinks", () => {
|
||||||
it("extracts simple urls", () => {
|
it("extracts simple urls", () => {
|
||||||
const result = transformMDLinks(
|
expect(
|
||||||
"Trying out links https://map.vault48.org/test 123"
|
transformMDLinks("Trying out links https://map.vault48.org/test 123")
|
||||||
);
|
).toBe(
|
||||||
|
|
||||||
expect(result).toBe(
|
|
||||||
"Trying out links [https://map.vault48…](https://map.vault48.org/test) 123"
|
"Trying out links [https://map.vault48…](https://map.vault48.org/test) 123"
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("works with that weird new VK urls", () => {
|
it("works with that weird new VK urls", () => {
|
||||||
const result = transformMDLinks(
|
expect(
|
||||||
"Trying out links [#alias|12345678901234567890123|https://map.vault48.org/test_abc_def_ghi] 123"
|
transformMDLinks(
|
||||||
|
"Trying out links [#alias|12345678901234567890123|https://map.vault48.org/test_abc_def_ghi] 123"
|
||||||
|
)
|
||||||
|
).toBe(
|
||||||
|
"Trying out links [1234567890123456789…](https://map.vault48.org/test_abc_def_ghi) 123"
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(result).toBe(
|
expect(
|
||||||
|
transformMDLinks(
|
||||||
|
"Trying out links [#alias|12345678901234567890123|map.vault48.org/test_abc_def_ghi] 123"
|
||||||
|
)
|
||||||
|
).toBe(
|
||||||
"Trying out links [1234567890123456789…](https://map.vault48.org/test_abc_def_ghi) 123"
|
"Trying out links [1234567890123456789…](https://map.vault48.org/test_abc_def_ghi) 123"
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
|
@ -1,13 +1,26 @@
|
||||||
import { URL } from "url";
|
import { URL } from "url";
|
||||||
|
|
||||||
const simpleUrlRegex = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s\]]{2,}|www\.[a-zA-Z0-9]+\.[^\s\]]{2,})/gim;
|
const simpleUrlRegex = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s\]]{2,}|www\.[a-zA-Z0-9]+\.[^\s\]]{2,})/gim;
|
||||||
const weirdLongUrlRegex = /\[(.*)\|(.*)\|(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s\]]{2,}|www\.[a-zA-Z0-9]+\.[^\s\]]{2,})\]/g;
|
|
||||||
|
/** Yep, that's how VK posts its links */
|
||||||
|
const weirdLongUrlRegex = /\[\#alias\|([^\|]+)\|([^\]]+)\]/gim;
|
||||||
|
|
||||||
|
/**
 * Ensures a URL string carries an explicit scheme.
 *
 * Empty strings and strings that already start with `http://` or `https://`
 * (case-insensitively) are returned unchanged; anything else is prefixed
 * with `https://`.
 *
 * The check looks for the full `scheme://` prefix rather than just the
 * letters "http", so scheme-less hosts such as `httpbin.org/get` are
 * still prefixed instead of being passed through without a scheme.
 *
 * @param url - Raw URL text, possibly without a scheme.
 * @returns The URL text with a scheme, or the input unchanged if empty.
 */
const fixUrl = (url: string) =>
  !url || /^https?:\/\//i.test(url) ? url : `https://${url}`;
|
||||||
|
|
||||||
/** Extracts URLs from text */
|
/** Extracts URLs from text */
|
||||||
export const extractURLs = (text: string): URL[] => {
|
export const extractURLs = (text: string): URL[] => {
|
||||||
const matches = text.match(simpleUrlRegex) || [];
|
const urls = new Set<string>();
|
||||||
|
|
||||||
return matches
|
text
|
||||||
|
.match(weirdLongUrlRegex)
|
||||||
|
?.forEach((match) =>
|
||||||
|
urls.add(fixUrl(match.replace(weirdLongUrlRegex, "$1")))
|
||||||
|
);
|
||||||
|
|
||||||
|
text.match(simpleUrlRegex)?.forEach((match) => urls.add(match));
|
||||||
|
|
||||||
|
return Array.from(urls)
|
||||||
.map((m) => {
|
.map((m) => {
|
||||||
try {
|
try {
|
||||||
return new URL(m);
|
return new URL(m);
|
||||||
|
@ -30,7 +43,10 @@ export const transformMDLinks = (value: string) =>
|
||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
return `[${trimTo(args[1], 20)}](${args[2]})`;
|
const title = trimTo(args[0] ?? args[1], 20);
|
||||||
|
const url = fixUrl(args[1]);
|
||||||
|
|
||||||
|
return `[${title}](${url})`;
|
||||||
})
|
})
|
||||||
.replace(simpleUrlRegex, (val) => {
|
.replace(simpleUrlRegex, (val) => {
|
||||||
if (val.endsWith(")")) {
|
if (val.endsWith(")")) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue