# Provenance: components/extractor/extract_sub_info.py, introduced by commit
# 7595389a5fce42ed7d663a246c9597fa3545d742 (A Farzat, 2025-10-08):
# "Add a module to obtain Subscription info from URLs".
"""Obtain subscription info (feed id, link and title) from YouTube URLs."""

from typing import Any, Dict, cast
from urllib.parse import urlparse, parse_qs
from urllib.request import urlopen

from bs4 import BeautifulSoup
from feedparser import parse  # type: ignore

from .check_url import is_youtube, is_playlist, is_channel


def get_sub_info_from_yt_url(url: str) -> Dict[str, Any]:
    """Return the feed details for a YouTube playlist or channel URL.

    The URL is first checked with ``is_youtube``. When ``is_playlist``
    accepts it, the feed URL is built from the ``list`` query parameter;
    every other YouTube URL is handled as a channel page and scraped for
    its RSS link.

    Args:
        url: The URL to resolve.

    Returns:
        A dict with the keys ``'id'``, ``'link'`` and ``'title'`` as
        produced by :func:`get_feed_details`.

    Raises:
        ValueError: If ``is_youtube`` rejects ``url``.
    """
    if not is_youtube(url):
        # ValueError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working.
        raise ValueError(url+" is not a youtube URL.")
    if is_playlist(url):
        return get_feed_details(get_playlist_feed(url))
    return get_feed_details(get_channel_feed(url))


def get_playlist_feed(url: str) -> str:
    """Build the feed URL for the playlist referenced by ``url``.

    Args:
        url: A URL carrying the playlist ID in its ``list`` query parameter.

    Returns:
        The ``https://www.youtube.com/feeds/videos.xml`` URL for that
        playlist ID.

    Raises:
        KeyError: If ``url`` has no ``list`` query parameter.
    """
    parsed_url = urlparse(url)
    # Extract playlist ID from query parameters; parse_qs maps each name to
    # a list of values, and only the first ``list`` value is used.
    query_params = parse_qs(parsed_url.query)
    playlist_id = query_params['list'][0]
    return "https://www.youtube.com/feeds/videos.xml?playlist_id="+playlist_id


def get_channel_feed(url: str, html: str = '') -> str:
    """Find the RSS feed URL advertised by a channel page.

    Args:
        url: Address of the page; it is only fetched when ``html`` is empty.
        html: Already-downloaded markup of the page. When non-empty, no
            network request is made.

    Returns:
        The ``href`` of the first ``<link>`` element whose ``title``
        attribute is ``"RSS"``.

    Raises:
        ValueError: If the page contains no such ``<link>`` element.
    """
    if not html:
        # Close the HTTP response deterministically instead of leaving it
        # to the garbage collector.
        with urlopen(url) as response:
            html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    link_obj = soup.find('link', {'title': "RSS"})
    if link_obj is None:
        # An explicit raise survives ``python -O``, unlike an ``assert``.
        raise ValueError("No RSS feed link found at "+url)
    return cast(str, link_obj["href"])


def get_feed_details(url: str) -> Dict[str, Any]:
    """Parse the feed at ``url`` and return its identifying details.

    Args:
        url: Feed location, passed straight to ``feedparser.parse``.

    Returns:
        A dict with the feed's ``'id'``, the ``href`` of its first link as
        ``'link'``, and its ``'title'``.

    Raises:
        KeyError: If the parsed feed lacks any of those fields.
    """
    feed = parse(url).feed
    return {
        'id': feed["id"],
        'link': feed["links"][0]["href"],
        'title': feed["title"],
    }