diff options
Diffstat (limited to 'data-collection/components')
| -rw-r--r-- | data-collection/components/subscriptions/main.py | 58 |
| -rw-r--r-- | data-collection/components/subscriptions/typing.py | 7 |
| -rw-r--r-- | data-collection/components/videos.py | 14 |
3 files changed, 57 insertions, 22 deletions
diff --git a/data-collection/components/subscriptions/main.py b/data-collection/components/subscriptions/main.py
index 352cf1b..6a3f5cd 100644
--- a/data-collection/components/subscriptions/main.py
+++ b/data-collection/components/subscriptions/main.py
@@ -1,36 +1,58 @@
-from dataclasses import dataclass, field
-from datetime import datetime
+from dataclasses import dataclass, field, asdict
+from datetime import datetime, UTC
 from sys import stderr
 from typing import TypedDict, List
 from bson.objectid import ObjectId
 from feedparser import parse # type: ignore
-import requests
+from pymongo.collection import Collection
+from pymongo.results import UpdateResult
 import schedule
+from components.database import subscriptions
+from components.subscriptions.typing import SubsDict
 from components.videos import VideoTuple
 
 @dataclass
 class Subscription:
-    id: str
+    _id: str
     link: str
     time_between_fetches: int
-    last_update: datetime = datetime.min
+    last_fetch: datetime = datetime.min.replace(tzinfo=UTC)
+    last_video_update: datetime = datetime.min.replace(tzinfo=UTC)
     videos: List[VideoTuple] = field(default_factory=list)
     subscribers: List[ObjectId] = field(default_factory=list)
 
     def __post_init__(self) -> None:
-        self._job: schedule.Job = schedule.every(self.time_between_fetches).second.do(self.update)
+        self._job: schedule.Job = schedule.every(self.time_between_fetches).minutes.do(self.fetch)
+        self._collection: Collection[SubsDict] = subscriptions
+        self._in_db: bool = False
 
-    def update(self) -> None:
+    def fetch(self) -> None:
         try:
-            for entry in parse(self.link)["entries"]:
-                self.videos.append(VideoTuple(
-                    id = entry.id,
-                    link = entry.link,
-                    title = entry.title,
-                    published = datetime.fromisoformat(entry.published),
-                    updated = datetime.fromisoformat(entry.updated),
-                    thumbnail = entry.media_thumbnail[0]["url"],
-                    summary = entry.summary,
-                ))
+            rss = parse(self.link)
         except Exception as e:
-            print("Ran into exception", e, file=stderr)
+            print("Ran into an exception while fetching", self._id + ":", e, file=stderr)
+            return
+        for vid in map(VideoTuple.from_rss_entry, rss.entries):
+            if vid.published > self.last_video_update:
+                self.videos.append(vid)
+            elif vid.updated > self.last_video_update:
+                for i, old_vid in enumerate(self.videos):
+                    if vid.id == old_vid.id:
+                        self.videos[i] = vid
+                        break
+        last_video_update = max((vid.updated for vid in self.videos))
+        if last_video_update > self.last_video_update:
+            print("Updating", self._id)
+            self.last_video_update = last_video_update
+            self.update_fields(["videos", "last_video_update"])
+        self.last_fetch = datetime.now(tz=UTC)
+
+    def update_fields(self, fields: List[str]) -> UpdateResult:
+        sub = asdict(self)
+        if self._in_db:
+            return self._collection.update_one(
+                {"_id": self._id},
+                {"$set": {key: sub[key] for key in fields}},
+            )
+        self._in_db = True
+        return self._collection.replace_one({"_id": self._id}, sub, upsert=True)
diff --git a/data-collection/components/subscriptions/typing.py b/data-collection/components/subscriptions/typing.py
index eebcece..8f2a298 100644
--- a/data-collection/components/subscriptions/typing.py
+++ b/data-collection/components/subscriptions/typing.py
@@ -4,9 +4,10 @@ from bson.objectid import ObjectId
 from components.videos import VideoTuple
 
 class SubsDict(TypedDict):
-    id: str
+    _id: str
     link: str
-    time_between_fetches: int # In hours.
-    last_update: datetime
+    time_between_fetches: int # In minutes.
+    last_fetch: datetime
+    last_video_update: datetime
     videos: List[VideoTuple]
     subscribers: List[ObjectId]
diff --git a/data-collection/components/videos.py b/data-collection/components/videos.py
index 71937b2..32dc1da 100644
--- a/data-collection/components/videos.py
+++ b/data-collection/components/videos.py
@@ -1,4 +1,4 @@
-from typing import NamedTuple
+from typing import NamedTuple, Any, Self
 from datetime import datetime
 
 class VideoTuple(NamedTuple):
@@ -9,3 +9,15 @@ class VideoTuple(NamedTuple):
     updated: datetime
     thumbnail: str
     summary: str
+
+    @classmethod
+    def from_rss_entry(cls, entry: Any) -> Self:
+        return cls(
+            id = entry.id,
+            link = entry.link,
+            title = entry.title,
+            published = datetime.fromisoformat(entry.published),
+            updated = datetime.fromisoformat(entry.updated),
+            thumbnail = entry.media_thumbnail[0]["url"],
+            summary = entry.summary,
+        )
