"""Identify what kind of YouTube resource (video, playlist, channel) a URL is."""

from re import search
from typing import Optional
from urllib.parse import urlparse, parse_qs, ParseResult

# Matches a hostname that is exactly one of the YouTube domains or one of
# their subdomains (www., m., music., ...). Anchoring on a label boundary
# rejects look-alikes such as "notyoutube.com".
_YOUTUBE_HOST = r'(?:^|\.)(?:youtube\.com|youtu\.be)\Z'

# Path prefixes whose next path segment is the video id.
_VIDEO_PREFIXES = ('/shorts/', '/embed/')

# Path prefixes that denote a channel page.
_CHANNEL_PREFIXES = ('/c/', '/user/', '/channel/', '/@')


def _parse_youtube(url: str) -> Optional[ParseResult]:
    """
    Parse `url` and return the result only if it is a usable YouTube URL.

    A URL qualifies when it is absolute (`http`/`https`) or scheme-relative
    (`//host/...`), its host is a YouTube domain or subdomain, and there is
    something after the domain (a non-empty path, query, or fragment).
    Returns None otherwise, including for URLs `urlparse` cannot parse.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # Raised for malformed bracketed hosts, e.g. "http://[youtube.com/x".
        return None
    # An empty scheme together with a host only occurs for "//host/..." URLs.
    if parsed.scheme not in ('', 'http', 'https'):
        return None
    # `hostname` is lower-cased and stripped of userinfo and port, so the
    # check looks at the real host rather than at text elsewhere in the URL.
    if not search(_YOUTUBE_HOST, parsed.hostname or ''):
        return None
    if not (parsed.path.strip('/') or parsed.query or parsed.fragment):
        return None
    return parsed


def is_youtube(url: str) -> bool:
    """
    Affirm the YouTube domain and that there is something after the domain.
    """
    return _parse_youtube(url) is not None


def is_video(url: str) -> bool:
    """
    Return True if `url` points at a single YouTube video.

    Recognised forms are `/watch`, `/shorts/<id>`, `/embed/<id>` on a
    YouTube host, and any non-empty path on the `youtu.be` short domain.
    """
    parsed = _parse_youtube(url)
    if parsed is None:
        return False
    host = parsed.hostname or ''
    if host == 'youtu.be' or host.endswith('.youtu.be'):
        # Short links carry the video id directly as the path.
        return bool(parsed.path.strip('/'))
    if parsed.path.rstrip('/') == '/watch':
        return True
    # Require an id after the prefix: "/shorts/abc" is a video, "/shorts/"
    # on its own is not.
    return any(
        parsed.path.startswith(prefix) and len(parsed.path) > len(prefix)
        for prefix in _VIDEO_PREFIXES
    )


def is_playlist(url: str) -> bool:
    """
    Return True if `url` is a YouTube URL carrying a non-empty `list`
    query parameter (a playlist page, or a video played from a playlist).
    """
    parsed = _parse_youtube(url)
    if parsed is None:
        return False
    # parse_qs drops blank values by default, so "?list=" does not count.
    return 'list' in parse_qs(parsed.query)


def is_channel(url: str) -> bool:
    """
    Return True if `url` is a YouTube URL whose path starts with one of the
    channel prefixes: `/c/`, `/user/`, `/channel/` or `/@`.
    """
    parsed = _parse_youtube(url)
    if parsed is None:
        return False
    return parsed.path.startswith(_CHANNEL_PREFIXES)