-
-
Notifications
You must be signed in to change notification settings - Fork 748
fix: Handling of site connection issues during outage #3482
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9d344a8
7a66cd4
26d30a8
8c3f009
9253760
96e3018
0bfb84e
0995803
2fb991a
2fe59dd
6216c17
685788e
7727c30
aa9ec7a
ea30b97
b5c99bf
bcf9cb9
200a4d8
4336867
9ce2efa
bac8217
4f209a6
3d0ad6c
c989eae
81f8d34
3b8d7eb
f1397a2
c27fb5a
0b89bdb
642f699
8ecd602
768b2fa
7dd05a0
e534601
af38322
4e203c2
989b257
ae6cd0d
8428c67
b5782ab
50cdacc
f22028b
f4d398b
43f1c5b
8b575f6
2cbb226
f10d6fa
92fc442
71bf092
0f81c17
feff4b2
e38f365
e098a66
4754f19
6d9fc0c
e1c3796
49e9639
abaccdf
673d486
97cdf38
972d298
015927e
2159edb
cd46116
1afa433
cc59caf
62d115a
1e733d5
47ff624
3e87546
6805c9a
e64ccf4
1e27dde
7bfd50d
cc0f3a1
3700aee
4d78a81
8d4d0a0
7a637c3
4e40408
c0ced04
a1f890f
f83a2da
a3a87d6
a453851
3f3fa8f
7e20fee
4b196cc
7d0f75d
1704c9f
8428cac
0d14e70
02e89ed
955682d
8310588
1524aff
b5fc5b2
4d73aad
3642840
695206e
791bb8e
1d977ed
061163b
768d22f
e67dc2e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import asyncio | ||
| import datetime | ||
| import io | ||
| import json | ||
|
|
@@ -24,7 +25,7 @@ | |
| import bot.exts.filtering._ui.filter as filters_ui | ||
| from bot import constants | ||
| from bot.bot import Bot | ||
| from bot.constants import BaseURLs, Channels, Guild, MODERATION_ROLES, Roles | ||
| from bot.constants import BaseURLs, Channels, Guild, MODERATION_ROLES, Roles, URLs | ||
| from bot.exts.backend.branding._repository import HEADERS, PARAMS | ||
| from bot.exts.filtering._filter_context import Event, FilterContext | ||
| from bot.exts.filtering._filter_lists import FilterList, ListType, ListTypeConverter, filter_list_types | ||
|
|
@@ -55,6 +56,7 @@ | |
| from bot.utils.channel import is_mod_channel | ||
| from bot.utils.lock import lock_arg | ||
| from bot.utils.message_cache import MessageCache | ||
| from bot.utils.retry import is_retryable_api_error | ||
|
|
||
| log = get_logger(__name__) | ||
|
|
||
|
|
@@ -108,7 +110,31 @@ async def cog_load(self) -> None: | |
| await self.bot.wait_until_guild_available() | ||
|
|
||
| log.trace("Loading filtering information from the database.") | ||
| raw_filter_lists = await self.bot.api_client.get("bot/filter/filter_lists") | ||
| for attempt in range(1, URLs.connect_max_retries + 1): | ||
| try: | ||
| raw_filter_lists = await self.bot.api_client.get("bot/filter/filter_lists") | ||
| break | ||
| except Exception as error: | ||
| is_retryable = is_retryable_api_error(error) | ||
| is_last_attempt = attempt == URLs.connect_max_retries | ||
|
|
||
| if not is_retryable: | ||
| raise | ||
|
|
||
| if is_last_attempt: | ||
| log.exception("Failed to load filtering data after %d attempts.", URLs.connect_max_retries) | ||
| raise | ||
|
|
||
| backoff_seconds = URLs.connect_initial_backoff * (2 ** (attempt - 1)) | ||
| log.warning( | ||
| "Failed to load filtering data (attempt %d/%d). Retrying in %d second(s): %s", | ||
| attempt, | ||
| URLs.connect_max_retries, | ||
| backoff_seconds, | ||
| error | ||
| ) | ||
| await asyncio.sleep(backoff_seconds) | ||
|
|
||
|
Comment on lines
+113
to
+137
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you repeated this backoff logic many times. Consider moving it to self.bot.api_client.get()
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, Thank you for your review, I’m a bit hesitant to put retries directly into If you think this might not be the problem, I can definately move it to the |
||
| example_list = None | ||
| for raw_filter_list in raw_filter_lists: | ||
| loaded_list = self._load_raw_filter_list(raw_filter_list) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import asyncio | ||
| import re | ||
| import typing as t | ||
| from datetime import UTC, datetime, timedelta | ||
|
|
@@ -12,7 +13,9 @@ | |
|
|
||
| from bot import constants | ||
| from bot.bot import Bot | ||
| from bot.constants import URLs | ||
| from bot.log import get_logger | ||
| from bot.utils.retry import is_retryable_api_error | ||
| from bot.utils.webhooks import send_webhook | ||
|
|
||
| PEPS_RSS_URL = "https://peps.python.org/peps.rss" | ||
|
|
@@ -46,19 +49,45 @@ def __init__(self, bot: Bot): | |
|
|
||
| async def cog_load(self) -> None: | ||
| """Load all existing seen items from db and create any missing mailing lists.""" | ||
| with sentry_sdk.start_span(description="Fetch mailing lists from site"): | ||
| response = await self.bot.api_client.get("bot/mailing-lists") | ||
|
|
||
| for mailing_list in response: | ||
| self.seen_items[mailing_list["name"]] = set(mailing_list["seen_items"]) | ||
|
|
||
| with sentry_sdk.start_span(description="Update site with new mailing lists"): | ||
| for mailing_list in ("pep", *constants.PythonNews.mail_lists): | ||
| if mailing_list not in self.seen_items: | ||
| await self.bot.api_client.post("bot/mailing-lists", json={"name": mailing_list}) | ||
| self.seen_items[mailing_list] = set() | ||
|
|
||
| self.fetch_new_media.start() | ||
| for attempt in range(1, URLs.connect_max_retries + 1): | ||
| try: | ||
| with sentry_sdk.start_span(description="Fetch mailing lists from site"): | ||
| response = await self.bot.api_client.get("bot/mailing-lists") | ||
|
|
||
| # Rebuild state on each successful fetch (avoid partial state across retries) | ||
| self.seen_items = {} | ||
| for mailing_list in response: | ||
| self.seen_items[mailing_list["name"]] = set(mailing_list["seen_items"]) | ||
|
|
||
| with sentry_sdk.start_span(description="Update site with new mailing lists"): | ||
| for mailing_list in ("pep", *constants.PythonNews.mail_lists): | ||
| if mailing_list not in self.seen_items: | ||
| await self.bot.api_client.post("bot/mailing-lists", json={"name": mailing_list}) | ||
| self.seen_items[mailing_list] = set() | ||
|
|
||
| self.fetch_new_media.start() | ||
| return | ||
|
Comment on lines
+58
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this in the
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, The intent was to make the whole startup sync step atomic and retry on transient API failures from both the
also the |
||
|
|
||
| except Exception as error: | ||
| if not is_retryable_api_error(error): | ||
| raise | ||
|
|
||
| if attempt == URLs.connect_max_retries: | ||
| log.exception( | ||
| "Failed to load PythonNews mailing lists after %d attempt(s).", | ||
| URLs.connect_max_retries, | ||
| ) | ||
| raise | ||
|
|
||
| backoff_seconds = URLs.connect_initial_backoff * (2 ** (attempt - 1)) | ||
| log.warning( | ||
| "Failed to load PythonNews mailing lists (attempt %d/%d). Retrying in %d second(s). Error: %s", | ||
| attempt, | ||
| URLs.connect_max_retries, | ||
| backoff_seconds, | ||
| error, | ||
| ) | ||
| await asyncio.sleep(backoff_seconds) | ||
|
|
||
| async def cog_unload(self) -> None: | ||
| """Stop news posting tasks on cog unload.""" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| from pydis_core.site_api import ResponseCodeError | ||
|
|
||
|
|
||
| def is_retryable_api_error(error: Exception) -> bool: | ||
| """Return whether an API error is temporary and worth retrying.""" | ||
| if isinstance(error, ResponseCodeError): | ||
| return error.status in (408, 429) or error.status >= 500 | ||
|
|
||
| return isinstance(error, (TimeoutError, OSError)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a reason this is defined within _load_extensions instead of it being a method of Bot?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi,
I defined
_load_oneinside_load_extensionsbecause it’s only used there. I didn't realized we could move it to privateBotmethod.If you want, I’m happy to move it to a private
Botmethod (e.g._load_single_extension). Shall I?