diff options
| author | A Farzat <a@farzat.xyz> | 2025-10-08 09:45:15 +0300 |
|---|---|---|
| committer | A Farzat <a@farzat.xyz> | 2025-10-08 10:12:30 +0300 |
| commit | a79fe26ade3a46ddf5649b2318e33ea95e15e0fa (patch) | |
| tree | 5aefc9376b3fe05ffbd52b80fd1e568b647ef9c5 | |
| parent | 5b565655b1e9bbfbe6e142266ba6c5290d0afa28 (diff) | |
| download | csca5028-a79fe26ade3a46ddf5649b2318e33ea95e15e0fa.tar.gz csca5028-a79fe26ade3a46ddf5649b2318e33ea95e15e0fa.zip | |
Add a module to get vid info through scraping
| -rw-r--r-- | components/extractor/obtain_vid_info.py | 13 | ||||
| -rw-r--r-- | requirements.txt | 1 |
2 files changed, 14 insertions, 0 deletions
```diff
diff --git a/components/extractor/obtain_vid_info.py b/components/extractor/obtain_vid_info.py
new file mode 100644
index 0000000..bfbe861
--- /dev/null
+++ b/components/extractor/obtain_vid_info.py
@@ -0,0 +1,13 @@
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+from isodate import parse_duration # type: ignore
+
+def obtain_vid_duration(url: str, html: str = '') -> int:
+    html = html or urlopen(url).read().decode('utf-8')
+    soup = BeautifulSoup(html, 'html.parser')
+
+    duration_meta = soup.find('meta', itemprop='duration')
+    assert duration_meta
+    duration = parse_duration(duration_meta['content'])
+    return int(duration.total_seconds())
diff --git a/requirements.txt b/requirements.txt
index 56de09d..53fab07 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ Flask==3.1.2
 flask-cors==6.0.1
 gunicorn==23.0.0
 idna==3.10
+isodate==0.7.2
 itsdangerous==2.2.0
 Jinja2==3.1.6
 MarkupSafe==3.0.2
```
