diff options
| author | A Farzat <a@farzat.xyz> | 2025-10-07 16:56:58 +0300 |
|---|---|---|
| committer | A Farzat <a@farzat.xyz> | 2025-10-08 09:48:32 +0300 |
| commit | 6ca8b8991a72ec5600aa1732acd170ba5ffeb853 (patch) | |
| tree | 4fd4ef3c9fbd27348a8256d14b787fee33e96466 | |
| parent | 53e5473b464bd1ba789b78a6a8f64d54cedfa8dd (diff) | |
| download | csca5028-6ca8b8991a72ec5600aa1732acd170ba5ffeb853.tar.gz csca5028-6ca8b8991a72ec5600aa1732acd170ba5ffeb853.zip | |
Add URL type identification functionality
| -rw-r--r-- | components/extractor/check_url.py | 29 | ||||
| -rw-r--r-- | tests/check_url.py | 40 |
2 files changed, 69 insertions, 0 deletions
# ---------------------------------------------------------------------------
# components/extractor/check_url.py
# ---------------------------------------------------------------------------
"""Classify URLs as YouTube videos, playlists or channels."""

from re import search
from typing import Optional
from urllib.parse import urlparse, parse_qs, ParseResult

# An empty scheme is allowed so protocol-relative URLs ("//youtube.com/...")
# keep working; urlparse only fills in a host for those when the string
# starts with "//".
_ALLOWED_SCHEMES = ('', 'http', 'https')

# Matches the bare domains and any of their subdomains (www., m., music., ...)
# but not look-alikes such as "notyoutube.com" or "youtube.com.evil.example".
_YOUTUBE_HOST_PATTERN = r'(?:^|\.)(?:youtube\.com|youtu\.be)\Z'
_SHORT_LINK_HOST_PATTERN = r'(?:^|\.)youtu\.be\Z'

_VIDEO_PATH_PREFIXES = ('/shorts/', '/embed/')
_CHANNEL_PATH_PREFIXES = ('/c/', '/user/', '/channel/', '/@')


def _parse_youtube(url: str) -> Optional[ParseResult]:
    """Return the parsed URL if it points at YouTube content, else ``None``.

    A URL qualifies when its scheme is http(s) or protocol-relative, its host
    is ``youtube.com``/``youtu.be`` or a subdomain of either, and something
    (a path segment, a query string or a fragment) follows the ``/`` after
    the host.
    """
    try:
        parsed = urlparse(url)
        host = parsed.hostname  # Lower-cased, without port or credentials.
    except ValueError:
        # urlparse rejects malformed authorities such as an unclosed "[".
        return None
    if parsed.scheme not in _ALLOWED_SCHEMES or not host:
        return None
    if not search(_YOUTUBE_HOST_PATTERN, host):
        return None
    if not parsed.path.startswith('/'):
        return None
    if not (parsed.path[1:] or parsed.query or parsed.fragment):
        return None
    return parsed


def is_youtube(url: str) -> bool:
    """
    Affirm the YouTube domain and that there is something after the domain.

    The domain is checked against the URL's actual host, so a foreign URL
    that merely mentions ``youtube.com`` in its path or query is rejected.
    Malformed URLs yield ``False`` rather than raising.
    """
    return _parse_youtube(url) is not None


def is_video(url: str) -> bool:
    """
    Tell whether the URL addresses a single YouTube video.

    Recognised forms are ``/watch``, ``/shorts/<id>``, ``/embed/<id>`` and
    any ``youtu.be`` short link.
    """
    parsed = _parse_youtube(url)
    if parsed is None:
        return False
    if parsed.path == '/watch' or parsed.path.startswith(_VIDEO_PATH_PREFIXES):
        return True
    return bool(search(_SHORT_LINK_HOST_PATTERN, parsed.hostname or ''))


def is_playlist(url: str) -> bool:
    """
    Tell whether the YouTube URL carries a non-empty ``list`` query parameter.

    A watch URL that is part of a playlist therefore counts as both a video
    and a playlist.
    """
    parsed = _parse_youtube(url)
    if parsed is None:
        return False
    # parse_qs drops parameters with blank values, so "?list=" is not enough.
    return 'list' in parse_qs(parsed.query)


def is_channel(url: str) -> bool:
    """
    Tell whether the YouTube URL path starts with a channel prefix.

    The prefixes are ``/c/``, ``/user/``, ``/channel/`` and ``/@``;
    sub-pages such as ``/@name/videos`` match as well.
    """
    parsed = _parse_youtube(url)
    if parsed is None:
        return False
    return parsed.path.startswith(_CHANNEL_PATH_PREFIXES)


# ---------------------------------------------------------------------------
# tests/check_url.py
# ---------------------------------------------------------------------------
# In the repository this is a separate file that begins with:
#   from components.extractor.check_url import is_youtube, is_channel, is_playlist, is_video

from unittest import TestCase


class Test_URL_Checker(TestCase):
    """Unit tests for the URL classification helpers."""

    def test_youtube_detection(self) -> None:
        self.assertTrue(is_youtube("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))
        self.assertFalse(is_youtube("https://archive.org/"))
        self.assertTrue(is_youtube("https://www.youtube.com/c/3blue1brown"))
        self.assertFalse(is_youtube("https://www.nasa.gov/"))
        self.assertTrue(is_youtube("https://youtu.be/jNQXAC9IVRw"))
        self.assertFalse(is_youtube("https://www.wikipedia.org/"))
        self.assertFalse(is_youtube("https://www.youtube.com/"))  # Nothing after the domain.
        self.assertFalse(is_youtube("https://xkcd.com/"))
        self.assertTrue(is_youtube("https://www.youtube.com/@kurzgesagt"))
        self.assertFalse(is_youtube("https://www.gutenberg.org/"))

    def test_youtube_detection_checks_the_real_host(self) -> None:
        # Mentioning YouTube in the path or query is not enough.
        self.assertFalse(is_youtube("https://example.com/?next=youtube.com/watch"))
        self.assertFalse(is_youtube("https://example.com/youtube.com/watch"))
        # Look-alike domains.
        self.assertFalse(is_youtube("https://notyoutube.com/watch?v=dQw4w9WgXcQ"))
        self.assertFalse(is_youtube("https://www.youtube.com.example.org/watch"))
        # Subdomains, protocol-relative URLs and upper-case spellings.
        self.assertTrue(is_youtube("https://m.youtube.com/watch?v=dQw4w9WgXcQ"))
        self.assertTrue(is_youtube("//www.youtube.com/watch?v=dQw4w9WgXcQ"))
        self.assertTrue(is_youtube("HTTPS://WWW.YOUTUBE.COM/watch?v=dQw4w9WgXcQ"))

    def test_malformed_urls_are_rejected_without_raising(self) -> None:
        malformed = "http://[youtube.com/watch"
        self.assertFalse(is_youtube(malformed))
        self.assertFalse(is_video(malformed))
        self.assertFalse(is_playlist(malformed))
        self.assertFalse(is_channel(malformed))

    def test_channel_detection(self) -> None:
        self.assertTrue(is_channel("https://www.youtube.com/@LexFridman"))
        self.assertTrue(is_channel("https://www.youtube.com/@PrimitiveTechnology"))
        self.assertTrue(is_channel("https://www.youtube.com/user/schafer5"))
        self.assertTrue(is_channel("https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A"))
        self.assertTrue(is_channel("https://www.youtube.com/c/mkbhd"))
        self.assertTrue(is_channel("https://www.youtube.com/@MentalOutlaw/videos"))
        self.assertFalse(is_channel("https://youtu.be/jNQXAC9IVRw"))
        self.assertFalse(is_channel("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))
        self.assertFalse(is_channel("https://www.youtube.com/playlist?list=PL3cu45aM3C2CADmCYeVhS4KTVut9MoMc9"))

    def test_playlist_detection(self) -> None:
        self.assertTrue(is_playlist("https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr"))
        self.assertTrue(is_playlist("https://www.youtube.com/watch?v=YykjpeuMNEk&list=PLirAqAtl_h2r5g8xGajEwdXd3x1sZh8hC&index=1&t=245s"))
        self.assertFalse(is_playlist("https://www.youtube.com/@LexFridman"))
        self.assertFalse(is_playlist("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))

    def test_video_detection(self) -> None:
        self.assertTrue(is_video("https://youtu.be/G8iEMVr7GFg?t=112"))
        self.assertTrue(is_video("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))
        self.assertTrue(is_video("https://youtu.be/jNQXAC9IVRw"))
        self.assertFalse(is_video("https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A"))
        self.assertFalse(is_video("https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr"))

    def test_shorts_and_embed_video_detection(self) -> None:
        self.assertTrue(is_video("https://www.youtube.com/shorts/dQw4w9WgXcQ"))
        self.assertTrue(is_video("https://www.youtube.com/embed/dQw4w9WgXcQ"))
        self.assertTrue(is_video("https://youtu.be:443/jNQXAC9IVRw"))
        self.assertFalse(is_video("https://www.youtube.com/@kurzgesagt"))
