1
0
Fork 0
mirror of https://github.com/muerwre/vk-tg-bot.git synced 2025-04-24 22:46:41 +07:00

fix links parsing

This commit is contained in:
Fedor Katurov 2025-02-24 20:30:24 +07:00
parent 2131447939
commit f28f291ac2
8 changed files with 68 additions and 33 deletions

View file

@ -12,11 +12,9 @@ RUN yarn build
FROM node:18-bookworm AS runner
COPY --from=builder /app/dist ./
WORKDIR /app/dist
COPY --from=builder /app/dist ./
COPY ./docker/wait-for-it.sh .
EXPOSE 80

View file

@ -9,7 +9,7 @@ import path from "path";
import hb from "handlebars";
import strip from "strip-markdown";
import { VFileCompatible } from "vfile";
import transformMDLinks from "../../utils/transformMDLinks";
import { transformMDLinks } from "../../utils/links";
const removeFrontmatter = () => (tree) => {
tree.children = tree.children.filter((item) => item.type !== "yaml");

View file

@ -11,7 +11,7 @@ import {
User,
} from "typegram";
import { keys } from "lodash";
import { extractURLs } from "../../../utils/extract";
import { extractURLs } from "../../../utils/links";
import logger from "../../logger";
import Composer from "telegraf";
import { Template } from "../../template";

View file

@ -1,4 +1,4 @@
import { extractURLs } from "../extract";
import { extractURLs } from "../links";
describe("extractURLs", () => {
it("extracts simple urls", () => {

View file

@ -0,0 +1,23 @@
import { transformMDLinks } from "../links";
// Specs for transformMDLinks: bare URLs and VK's bracketed `[alias|title|url]`
// markup must both come out as markdown links with a shortened title.
describe("transformMDLinks", () => {
  it("wraps simple urls into markdown links", () => {
    const result = transformMDLinks(
      "Trying out links https://map.vault48.org/test 123"
    );

    // The visible title is cut to 19 characters plus an ellipsis; the target stays intact.
    expect(result).toBe(
      "Trying out links [https://map.vault48…](https://map.vault48.org/test) 123"
    );
  });

  it("works with that weird new VK urls", () => {
    const result = transformMDLinks(
      "Trying out links [#alias|12345678901234567890123|https://map.vault48.org/test_abc_def_ghi] 123"
    );

    // The middle segment becomes the (shortened) title, the last segment the link target.
    expect(result).toBe(
      "Trying out links [1234567890123456789…](https://map.vault48.org/test_abc_def_ghi) 123"
    );
  });
});

View file

@ -1,17 +0,0 @@
import { URL } from "url";
/**
 * Matches `http(s)://` URLs and scheme-less `www.` URLs.
 * The tail of a match runs until the next whitespace character or `]`.
 */
const urlRe = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s\]]{2,}|www\.[a-zA-Z0-9]+\.[^\s\]]{2,})/gim;

/**
 * Collects every URL found in `text`.
 *
 * Candidates that the `URL` constructor rejects (for example scheme-less
 * `www.` matches) are skipped rather than reported.
 *
 * @param text - arbitrary text to scan
 * @returns parsed URLs in order of appearance; empty when nothing matches
 */
export const extractURLs = (text: string): URL[] => {
  const candidates = text.match(urlRe);
  if (candidates === null) {
    return [];
  }

  const parsed: URL[] = [];
  for (const candidate of candidates) {
    try {
      parsed.push(new URL(candidate));
    } catch (e) {
      // Not a parseable URL: leave it out of the result.
    }
  }

  return parsed;
};

41
src/utils/links.ts Normal file
View file

@ -0,0 +1,41 @@
import { URL } from "url";
/**
 * Matches a bare URL: `http(s)://host.tld…` or a scheme-less `www.host.tld…`.
 * The tail of a match runs until the next whitespace character or `]`.
 */
const simpleUrlRegex = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s\]]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s\]]{2,}|www\.[a-zA-Z0-9]+\.[^\s\]]{2,})/gim;

/**
 * Source of the pattern for VK's bracketed link markup `[alias|title|url]`.
 * Captures the title and the url. Alias and title are not allowed to contain
 * square brackets or line breaks, so a single match can never stretch from one
 * link to the next and swallow the text in between.
 */
const weirdLongUrlSource = String.raw`\[[^\[\]\r\n]*\|([^\[\]|\r\n]*)\|${simpleUrlRegex.source}\]`;

/**
 * Matches either a bracketed VK link (groups 1 and 2: title and url) or a bare
 * URL (group 3). Both kinds are handled by one regex so that text produced for
 * a bracketed link is never scanned a second time.
 */
const anyLinkRegex = new RegExp(
  `${weirdLongUrlSource}|${simpleUrlRegex.source}`,
  "gi"
);

/** Maximum length of a link title in the markdown output */
const maxTitleLength = 20;

/**
 * Extracts URLs from text.
 *
 * Candidates that the `URL` constructor rejects (for example scheme-less
 * `www.` matches) are skipped rather than reported.
 *
 * @param text - arbitrary text to scan
 * @returns parsed URLs in order of appearance; empty when nothing matches
 */
export const extractURLs = (text: string): URL[] => {
  const matches = text.match(simpleUrlRegex) || [];

  return matches
    .map((m) => {
      try {
        return new URL(m);
      } catch (e) {
        return;
      }
    })
    .filter((el) => el) as URL[];
};

/**
 * Shortens `val` when it is longer than `maxLength`: keeps the first
 * `maxLength - 1` characters and appends "…". Shorter values are returned as is.
 */
const trimTo = (val: string, maxLength: number) =>
  val.length > maxLength ? val.substring(0, maxLength - 1).concat("…") : val;

/**
 * Formatting all links in markdown output, trimming them to reasonable length.
 *
 * - `[alias|title|url]` becomes `[title](url)`, with the title shortened.
 * - A bare URL becomes `[shortened url](url)`.
 * - A bare URL whose match ends with `)` is left untouched, because that is how
 *   the target of an already formatted `[text](url)` link looks.
 *
 * @param value - text that may contain bare URLs and VK bracketed links
 * @returns the text with links rewritten as markdown links
 */
export const transformMDLinks = (value: string): string =>
  value.replace(
    anyLinkRegex,
    (match: string, title?: string, bracketedUrl?: string) => {
      // The url group of the bracketed alternative is only set for VK markup.
      if (bracketedUrl !== undefined) {
        return `[${trimTo(title ?? "", maxTitleLength)}](${bracketedUrl})`;
      }

      // Looks like the target of an existing markdown link: keep it as is.
      if (match.endsWith(")")) {
        return match;
      }

      return `[${trimTo(match, maxTitleLength)}](${match})`;
    }
  );

View file

@ -1,10 +0,0 @@
/**
 * Matches `http(s)://` URLs and scheme-less `www.` URLs.
 * The tail of a match runs until the next whitespace character.
 */
const urlRegex = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})/g;
/**
 * Shortens `val` when it is longer than `maxLength`: keeps the first
 * `maxLength - 1` characters and appends "…". Shorter values are returned as is.
 */
const trimTo = (val: string, maxLength: number) => {
  if (val.length > maxLength) {
    const head = val.substring(0, maxLength - 1);
    return head.concat("…");
  }

  return val;
};
/**
 * Rewrites every URL found in `value` as a markdown link whose visible text is
 * the URL shortened to a reasonable length and whose target is the full URL.
 */
export default (value: string) =>
  value.replace(urlRegex, (url) => `[${trimTo(url, 20)}](${url})`);