summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- components/extractor/extract_sub_info.py 37
-rw-r--r-- requirements.txt 2
2 files changed, 39 insertions, 0 deletions
diff --git a/components/extractor/extract_sub_info.py b/components/extractor/extract_sub_info.py
new file mode 100644
index 0000000..c128fdf
--- /dev/null
+++ b/components/extractor/extract_sub_info.py
@@ -0,0 +1,37 @@
+from typing import Any, Dict, cast
+from urllib.parse import urlparse, parse_qs, quote
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+from feedparser import parse # type: ignore
+
+from .check_url import is_youtube, is_playlist, is_channel
+
+def get_sub_info_from_yt_url(url: str) -> Dict[str, Any]:
+    """Resolve a YouTube URL to the details of its feed.
+
+    Playlist URLs (as decided by ``is_playlist``) are mapped to their
+    playlist feed; every other YouTube URL is handled as a channel page.
+
+    Args:
+        url: The URL to resolve.
+
+    Returns:
+        The mapping produced by ``get_feed_details`` for the resolved feed.
+
+    Raises:
+        ValueError: If ``is_youtube`` rejects ``url``.
+    """
+    if not is_youtube(url):
+        # ValueError is still an Exception, so callers that catch the
+        # broad type keep working, while new callers can be specific.
+        raise ValueError(f"{url} is not a youtube URL.")
+    if is_playlist(url):
+        feed_url = get_playlist_feed(url)
+    else:
+        feed_url = get_channel_feed(url)
+    return get_feed_details(feed_url)
+
+def get_playlist_feed(url: str) -> str:
+    """Build the YouTube feed URL for the playlist referenced by ``url``.
+
+    Args:
+        url: A URL whose query string carries the playlist id in ``list``.
+
+    Returns:
+        The ``feeds/videos.xml`` URL for the first ``list`` value.
+
+    Raises:
+        KeyError: If ``url`` has no non-empty ``list`` query parameter.
+    """
+    # Extract playlist ID from query parameters
+    query_params = parse_qs(urlparse(url).query)
+    playlist_id = query_params['list'][0]
+    # parse_qs percent-decodes the value, so it has to be re-encoded before
+    # it is placed back into a query string; otherwise characters such as
+    # '&' or '=' would corrupt the resulting URL.
+    encoded_id = quote(playlist_id, safe='')
+    return "https://www.youtube.com/feeds/videos.xml?playlist_id=" + encoded_id
+
+def get_channel_feed(url: str, html: str = '') -> str:
+    """Return the RSS feed URL advertised by a channel page.
+
+    Args:
+        url: Address of the page; downloaded only when ``html`` is empty.
+        html: Optional page markup. When non-empty it is parsed instead of
+            fetching ``url``.
+
+    Returns:
+        The ``href`` attribute of the ``<link>`` element found with
+        ``title="RSS"``.
+
+    Raises:
+        AssertionError: If the markup contains no such ``<link>`` element.
+    """
+    if not html:
+        # Use the response as a context manager so the connection is
+        # closed deterministically instead of being leaked.
+        with urlopen(url) as response:
+            html = response.read().decode('utf-8')
+    soup = BeautifulSoup(html, 'html.parser')
+    link_obj = soup.find('link', {'title': "RSS"})
+    if not link_obj:
+        # Raised explicitly rather than via ``assert`` so the check still
+        # runs under ``python -O``; the exception type is kept unchanged
+        # for existing callers.
+        raise AssertionError(f"no RSS <link> element found for {url}")
+    return cast(str, link_obj["href"])
+
+def get_feed_details(url: str) -> Dict[str, Any]:
+    """Parse ``url`` with feedparser and summarise the feed-level metadata.
+
+    Returns:
+        A dict with the feed's ``id``, the ``href`` of its first entry in
+        ``links`` (under ``link``), and its ``title``.
+    """
+    metadata = parse(url).feed
+    details: Dict[str, Any] = {}
+    # Fields are read in the same order as they appear in the result.
+    details['id'] = metadata["id"]
+    details['link'] = metadata["links"][0]["href"]
+    details['title'] = metadata["title"]
+    return details
diff --git a/requirements.txt b/requirements.txt
index 3c3ef86..56de09d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+beautifulsoup4==4.14.2
blinker==1.9.0
certifi==2025.8.3
charset-normalizer==3.4.3
@@ -23,6 +24,7 @@ pytz==2025.2
schedule==1.2.2
sentinels==1.1.1
sgmllib3k==1.0.0
+soupsieve==2.8
types-Flask-Cors==6.0.0.20250809
typing_extensions==4.14.1
Werkzeug==3.1.3