Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
remove links from telegram text messages
Browse files Browse the repository at this point in the history
  • Loading branch information
diicellman committed Feb 12, 2024
1 parent 92cbd57 commit 6293ee3
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion llama_hub/telegram/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Telegram reader that reads posts/chats and comments to post from Telegram channel or chat."""
import asyncio
import re
from typing import List, Union

from llama_index.readers.base import BaseReader
Expand Down Expand Up @@ -102,5 +103,15 @@ async def _load_data(
entity_name, reply_to=post_id, limit=limit
):
if isinstance(message.text, str) and message.text != "":
results.append(Document(text=message.text))
results.append(Document(text=self._remove_links(message.text)))
return results

def _remove_links(self, string) -> str:
"""Removes all URLs from a given string, leaving only the base domain name."""

def replace_match(match):
text = match.group(1)
return text if text else ""

url_pattern = r"https?://(?:www\.)?((?!www\.).)+?"
return re.sub(url_pattern, replace_match, string)

0 comments on commit 6293ee3

Please sign in to comment.