From 7595389a5fce42ed7d663a246c9597fa3545d742 Mon Sep 17 00:00:00 2001
From: A Farzat
Date: Wed, 8 Oct 2025 09:34:42 +0300
Subject: Add a module to obtain Subscription info from URLs

---
Review notes: line breaks restored; extract_sub_info.py revised (ValueError
instead of a bare Exception/assert, HTTP response closed via a context
manager, playlist ID URL-encoded, docstrings added). The blob id on the
"index" line below still names the originally submitted file contents.

 components/extractor/extract_sub_info.py | 77 ++++++++++++++++++++++++++++++++
 requirements.txt                         |  2 +
 2 files changed, 79 insertions(+)
 create mode 100644 components/extractor/extract_sub_info.py

diff --git a/components/extractor/extract_sub_info.py b/components/extractor/extract_sub_info.py
new file mode 100644
index 0000000..c128fdf
--- /dev/null
+++ b/components/extractor/extract_sub_info.py
@@ -0,0 +1,77 @@
+"""Resolve YouTube channel and playlist URLs to their feed details."""
+from typing import Any, Dict, cast
+from urllib.parse import urlencode, urlparse, parse_qs
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+from feedparser import parse # type: ignore
+
+from .check_url import is_youtube, is_playlist, is_channel
+
+_YT_FEED_URL = "https://www.youtube.com/feeds/videos.xml"
+
+
+def get_sub_info_from_yt_url(url: str) -> Dict[str, Any]:
+    """Return the feed details (id, link, title) for a YouTube URL.
+
+    Playlist URLs are mapped straight to their feed URL; every other YouTube
+    URL is treated as a channel page and its feed link is read from the HTML.
+
+    Raises:
+        ValueError: if ``url`` is not a YouTube URL.
+    """
+    if not is_youtube(url):
+        # ValueError is still an Exception, so existing handlers keep working.
+        raise ValueError(url+" is not a youtube URL.")
+    if is_playlist(url):
+        return get_feed_details(get_playlist_feed(url))
+    return get_feed_details(get_channel_feed(url))
+
+
+def get_playlist_feed(url: str) -> str:
+    """Build the feed URL for the playlist in ``url``'s ``list`` parameter.
+
+    Raises:
+        KeyError: if ``url`` has no (non-empty) ``list`` query parameter.
+    """
+    query_params = parse_qs(urlparse(url).query)
+    playlist_id = query_params['list'][0]
+    # parse_qs percent-decodes the ID, so encode it again; otherwise a crafted
+    # ID could smuggle extra query parameters into the feed URL.
+    return _YT_FEED_URL+"?"+urlencode({'playlist_id': playlist_id})
+
+
+def get_channel_feed(url: str, html: str = '') -> str:
+    """Return the RSS feed URL advertised by a channel page.
+
+    Args:
+        url: Channel page URL; it is only downloaded when ``html`` is empty.
+        html: Markup of the page, if the caller already has it.
+
+    Raises:
+        ValueError: if the page has no ``<link title="RSS">`` with an ``href``.
+    """
+    if not html:
+        # The context manager closes the connection even if read/decode fails.
+        with urlopen(url) as response:
+            html = response.read().decode('utf-8')
+    soup = BeautifulSoup(html, 'html.parser')
+    link_obj = soup.find('link', attrs={'title': "RSS"}, href=True)
+    if not link_obj:
+        # Raised explicitly: an assert would be stripped under ``python -O``.
+        raise ValueError("No RSS feed link found for "+url)
+    return cast(str, link_obj["href"])
+
+
+def get_feed_details(url: str) -> Dict[str, Any]:
+    """Parse the feed at ``url`` and return its ``id``, ``link`` and ``title``.
+
+    ``link`` is the ``href`` of the first item in the feed's ``links`` list.
+    Missing fields are not handled here; the lookup error reaches the caller.
+    """
+    feed = parse(url).feed
+    return {
+        'id': feed["id"],
+        'link': feed["links"][0]["href"],
+        'title': feed["title"],
+    }
diff --git a/requirements.txt b/requirements.txt
index 3c3ef86..56de09d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+beautifulsoup4==4.14.2
 blinker==1.9.0
 certifi==2025.8.3
 charset-normalizer==3.4.3
@@ -23,6 +24,7 @@ pytz==2025.2
 schedule==1.2.2
 sentinels==1.1.1
 sgmllib3k==1.0.0
+soupsieve==2.8
 types-Flask-Cors==6.0.0.20250809
 typing_extensions==4.14.1
 Werkzeug==3.1.3
-- 
cgit v1.2.3-70-g09d2