summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author A Farzat <a@farzat.xyz> 2025-10-08 09:45:15 +0300
committer A Farzat <a@farzat.xyz> 2025-10-08 10:12:30 +0300
commit a79fe26ade3a46ddf5649b2318e33ea95e15e0fa (patch)
tree 5aefc9376b3fe05ffbd52b80fd1e568b647ef9c5
parent 5b565655b1e9bbfbe6e142266ba6c5290d0afa28 (diff)
download csca5028-a79fe26ade3a46ddf5649b2318e33ea95e15e0fa.tar.gz
csca5028-a79fe26ade3a46ddf5649b2318e33ea95e15e0fa.zip
Add a module to get vid info through scraping
-rw-r--r-- components/extractor/obtain_vid_info.py 13
-rw-r--r-- requirements.txt 1
2 files changed, 14 insertions, 0 deletions
diff --git a/components/extractor/obtain_vid_info.py b/components/extractor/obtain_vid_info.py
new file mode 100644
index 0000000..bfbe861
--- /dev/null
+++ b/components/extractor/obtain_vid_info.py
@@ -0,0 +1,13 @@
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+from isodate import parse_duration # type: ignore
+
+def obtain_vid_duration(url: str, html: str = '') -> int:
+    """Return the duration, in whole seconds, of the video page at *url*.
+
+    The duration is read from the page's ``<meta itemprop="duration">`` tag,
+    whose ``content`` attribute is handed to ``isodate.parse_duration``.
+
+    Args:
+        url: Address of the video page; fetched only when *html* is empty.
+        html: Already-downloaded page markup. When non-empty, no network
+            request is made.
+
+    Returns:
+        The parsed duration truncated to an integer number of seconds.
+
+    Raises:
+        ValueError: If the page has no ``<meta itemprop="duration">`` tag.
+    """
+    if not html:
+        # Close the HTTP response deterministically instead of leaking it.
+        with urlopen(url) as response:
+            html = response.read().decode('utf-8')
+    soup = BeautifulSoup(html, 'html.parser')
+
+    duration_meta = soup.find('meta', itemprop='duration')
+    # Raise explicitly: an ``assert`` is stripped when Python runs with -O.
+    if duration_meta is None:
+        raise ValueError(f'no <meta itemprop="duration"> tag found for {url!r}')
+    duration = parse_duration(duration_meta['content'])
+    return int(duration.total_seconds())
diff --git a/requirements.txt b/requirements.txt
index 56de09d..53fab07 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ Flask==3.1.2
flask-cors==6.0.1
gunicorn==23.0.0
idna==3.10
+isodate==0.7.2
itsdangerous==2.2.0
Jinja2==3.1.6
MarkupSafe==3.0.2