summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: A Farzat <a@farzat.xyz> 2025-10-07 16:56:58 +0300
committer: A Farzat <a@farzat.xyz> 2025-10-08 09:48:32 +0300
commit: 6ca8b8991a72ec5600aa1732acd170ba5ffeb853 (patch)
tree: 4fd4ef3c9fbd27348a8256d14b787fee33e96466
parent: 53e5473b464bd1ba789b78a6a8f64d54cedfa8dd (diff)
download: csca5028-6ca8b8991a72ec5600aa1732acd170ba5ffeb853.tar.gz
download: csca5028-6ca8b8991a72ec5600aa1732acd170ba5ffeb853.zip
Add URL type identification functionality
-rw-r--r-- components/extractor/check_url.py 29
-rw-r--r-- tests/check_url.py 40
2 files changed, 69 insertions, 0 deletions
diff --git a/components/extractor/check_url.py b/components/extractor/check_url.py
new file mode 100644
index 0000000..b574b22
--- /dev/null
+++ b/components/extractor/check_url.py
@@ -0,0 +1,29 @@
+from re import search
+from urllib.parse import urlparse, parse_qs, ParseResult
+
+def is_youtube(url: str) -> bool:
+    """
+    Affirm the YouTube domain and that there is something after the domain.
+
+    The host is taken from the parsed URL instead of being searched for in
+    the raw string, so look-alike hosts (``notyoutube.com``) and URLs that
+    merely mention YouTube in their path or query are rejected.
+
+    :param url: Absolute ``http(s)`` URL or scheme-relative ``//host/...`` URL.
+    :return: True if the host is youtube.com, youtu.be or a subdomain of
+        either, and the URL has a path, query or fragment after the host.
+    """
+    try:
+        parsed_url = urlparse(url)
+    except ValueError:
+        # urlparse rejects some malformed hosts (e.g. a broken IPv6 literal).
+        return False
+    if parsed_url.scheme not in ('http', 'https', ''):
+        return False
+    # hostname is lower-cased and stripped of credentials and port.
+    host = parsed_url.hostname or ''
+    on_youtube = any(
+        host == domain or host.endswith('.' + domain)
+        for domain in ('youtube.com', 'youtu.be')
+    )
+    if not on_youtube:
+        return False
+    # A bare "/" does not count as content after the domain.
+    return bool(parsed_url.path.strip('/') or parsed_url.query or parsed_url.fragment)
+
+def is_video(url: str) -> bool:
+    """
+    Tell whether a URL points at a single YouTube video.
+
+    Recognised forms are ``/watch``, ``/shorts/<id>``, ``/embed/<id>`` and
+    any path on the ``youtu.be`` short-link host.
+
+    :param url: URL to classify.
+    :return: True for a video URL, False for anything else (including
+        URLs that ``is_youtube`` rejects).
+    """
+    if not is_youtube(url):
+        return False
+    parsed_url = urlparse(url)
+    if parsed_url.path == '/watch':
+        return True
+    # The video ID follows these prefixes inside the path, so an exact
+    # comparison against the bare prefix would never match a real link.
+    if parsed_url.path.startswith(('/shorts/', '/embed/')):
+        return True
+    return parsed_url.netloc == 'youtu.be'
+
+def is_playlist(url: str) -> bool:
+    """
+    Tell whether a YouTube URL carries a ``list`` query parameter.
+
+    Blank values are dropped by ``parse_qs`` by default, so ``?list=`` with
+    no value does not count.
+    """
+    if is_youtube(url):
+        return 'list' in parse_qs(urlparse(url).query)
+    return False
+
+def is_channel(url: str) -> bool:
+    """
+    Tell whether a YouTube URL's path starts with a channel prefix.
+
+    The accepted prefixes are ``/c/``, ``/user/``, ``/channel/`` and ``/@``.
+    """
+    channel_prefixes = ('/c/', '/user/', '/channel/', '/@')
+    return is_youtube(url) and urlparse(url).path.startswith(channel_prefixes)
diff --git a/tests/check_url.py b/tests/check_url.py
new file mode 100644
index 0000000..ae29c53
--- /dev/null
+++ b/tests/check_url.py
@@ -0,0 +1,40 @@
+from unittest import TestCase
+
+from components.extractor.check_url import is_youtube, is_channel, is_playlist, is_video
+
+class Test_URL_Checker(TestCase):
+    """
+    Table-driven checks for the URL classifiers imported from check_url.
+    """
+
+    def test_youtube_detection(self) -> None:
+        """
+        Check is_youtube against YouTube and non-YouTube URLs, interleaved.
+        """
+        cases = (
+            ("https://www.youtube.com/watch?v=dQw4w9WgXcQ", True),
+            ("https://archive.org/", False),
+            ("https://www.youtube.com/c/3blue1brown", True),
+            ("https://www.nasa.gov/", False),
+            ("https://youtu.be/jNQXAC9IVRw", True),
+            ("https://www.wikipedia.org/", False),
+            ("https://www.youtube.com/", False),  # Nothing after the domain.
+            ("https://xkcd.com/", False),
+            ("https://www.youtube.com/@kurzgesagt", True),
+            ("https://www.gutenberg.org/", False),
+        )
+        for url, expected in cases:
+            verify = self.assertTrue if expected else self.assertFalse
+            verify(is_youtube(url))
+
+    def test_channel_detection(self) -> None:
+        """
+        Check is_channel against channel URLs, then video and playlist URLs.
+        """
+        channels = (
+            "https://www.youtube.com/@LexFridman",
+            "https://www.youtube.com/@PrimitiveTechnology",
+            "https://www.youtube.com/user/schafer5",
+            "https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A",
+            "https://www.youtube.com/c/mkbhd",
+            "https://www.youtube.com/@MentalOutlaw/videos",
+        )
+        others = (
+            "https://youtu.be/jNQXAC9IVRw",
+            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+            "https://www.youtube.com/playlist?list=PL3cu45aM3C2CADmCYeVhS4KTVut9MoMc9",
+        )
+        for url in channels:
+            self.assertTrue(is_channel(url))
+        for url in others:
+            self.assertFalse(is_channel(url))
+
+    def test_playlist_detection(self) -> None:
+        """
+        Check is_playlist against URLs with and without a list parameter.
+        """
+        playlists = (
+            "https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr",
+            "https://www.youtube.com/watch?v=YykjpeuMNEk&list=PLirAqAtl_h2r5g8xGajEwdXd3x1sZh8hC&index=1&t=245s",
+        )
+        others = (
+            "https://www.youtube.com/@LexFridman",
+            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+        )
+        for url in playlists:
+            self.assertTrue(is_playlist(url))
+        for url in others:
+            self.assertFalse(is_playlist(url))
+
+    def test_video_detection(self) -> None:
+        """
+        Check is_video against video URLs, then a channel and a playlist URL.
+        """
+        videos = (
+            "https://youtu.be/G8iEMVr7GFg?t=112",
+            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+            "https://youtu.be/jNQXAC9IVRw",
+        )
+        others = (
+            "https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A",
+            "https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr",
+        )
+        for url in videos:
+            self.assertTrue(is_video(url))
+        for url in others:
+            self.assertFalse(is_video(url))