diff --git a/README.md b/README.md index 0e9280d..2455040 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,18 @@ sudo apt install apport-retrace python3-amqp python3-bson python3-cassandra pyth sudo apt install python3-django-tastypie python3-numpy ``` -Then start a local Cassandra, RabbitMQ and swift (`docker` should works fine too): +Then start a local Cassandra, RabbitMQ and swift (`docker` should work fine too): ``` podman run --name cassandra --network host --rm -d -e HEAP_NEWSIZE=10M -e MAX_HEAP_SIZE=200M docker.io/cassandra podman run --name rabbitmq --network host --rm -d docker.io/rabbitmq podman run --name swift --network host --rm -d docker.io/openstackswift/saio ``` +> Note: +> * Cassandra can take some time (a minute or two?) to fully start. +> * Also, sometimes, Cassandra can hang and you get some `OperationTimedOut` +> issues out of nowhere. Just `podman kill cassandra` and restart it. + You can then then run the tests with `pytest`: ``` cd src diff --git a/src/daisy/submit.py b/src/daisy/submit.py index 5e31c8c..f0cf3c3 100644 --- a/src/daisy/submit.py +++ b/src/daisy/submit.py @@ -36,22 +36,6 @@ logger = logging.getLogger("daisy") -def update_counters(release, src_package, date, src_version=None): - if src_version: - key = "%s:%s:%s" % (release, src_package, src_version) - else: - key = "%s:%s" % (release, src_package) - cassandra_schema.Counters(key=key.encode(), column1=date).update(value=1) - - -def update_proposed_counters(release, src_package, date, src_version=None): - if src_version: - key = "%s:%s:%s" % (release, src_package, src_version) - else: - key = "%s:%s" % (release, src_package) - cassandra_schema.CountersForProposed(key=key.encode(), column1=date).update(value=1) - - def create_minimal_report_from_bson(data): report = Report() for key in data: @@ -181,12 +165,6 @@ def submit(request, system_token): pkg_arch = utils.get_package_architecture(data) problem_type = data.get("ProblemType", "") apport_version = data.get("ApportVersion", "") - third_party = False - if not utils.retraceable_package(package): - third_party = True - automated_testing = False - if system_token.startswith("deadbeef"): - automated_testing = True if not release: metrics.meter("missing.missing_release") @@ -221,21 +199,6 @@ def submit(request, system_token): problem_type, release, package, version, pkg_arch ) - # generic counter for crashes about a source package which is used by the - # phased-updater and only includes official Ubuntu packages and not those - # crahses from systems under auto testing. - if not third_party and not automated_testing and problem_type == "Crash": - update_counters(release=release, src_package=src_package, date=day_key) - if version == "": - metrics.meter("missing.missing_package_version") - else: - update_counters( - release=release, - src_package=src_package, - src_version=version, - date=day_key, - ) - # ProcMaps is useful for creating a crash sig, not after that if "Traceback" in data and "ProcMaps" in data: data.pop("ProcMaps") @@ -262,18 +225,6 @@ def submit(request, system_token): package_from_proposed = False if "package-from-proposed" in tags: package_from_proposed = True - # generic counter for crashes about a source package which is used by - # the phased-updater and only includes official Ubuntu packages and - # not those from systems under auto testing. 
- if not third_party and not automated_testing and problem_type == "Crash": - update_proposed_counters(release=release, src_package=src_package, date=day_key) - if version != "": - update_proposed_counters( - release=release, - src_package=src_package, - src_version=version, - date=day_key, - ) # A device is manually blocklisted if it has repeatedly failed to have an # crash inserted into the OOPS table. @@ -356,7 +307,7 @@ def bucket(oops_id, data, day_key): key=b"crash_signature_for_stacktrace_address_signature", column1=addr_sig ).value.decode() except DoesNotExist: - pass + metrics.meter("missing.crash_signature") failed_to_retrace = False if crash_sig.startswith("failed:"): failed_to_retrace = True @@ -401,7 +352,9 @@ def bucket(oops_id, data, day_key): "StacktraceTop", ) for unneeded_column in unneeded_columns: - cassandra_schema.OOPS.filter(key=oops_id.encode(), column1=unneeded_column).delete() + cassandra_schema.OOPS.filter( + key=oops_id.encode(), column1=unneeded_column + ).delete() # We have already retraced for this address signature, so this # crash can be immediately bucketed. utils.bucket(oops_id, crash_sig, data) diff --git a/src/errors/api/resources.py index dfad4b6..db76b6f 100644 --- a/src/errors/api/resources.py +++ b/src/errors/api/resources.py @@ -1,23 +1,8 @@ # Treat strings as UTF-8 instead of ASCII -import importlib -import sys -from functools import cmp_to_key - -importlib.reload(sys) - -from tastypie import fields -from tastypie.authentication import Authentication, SessionAuthentication -from tastypie.authorization import Authorization, DjangoAuthorization -from tastypie.exceptions import NotFound -from tastypie.resources import Resource - -from errors import cassie - -TASTYPIE_FULL_DEBUG = True - import datetime import json as simplejson from collections import OrderedDict +from functools import cmp_to_key from hashlib import sha1 from operator import itemgetter from urllib.error import HTTPError @@ -25,8 +10,14 @@ import apt from django.core.serializers import json +from tastypie import fields +from tastypie.authentication import Authentication, SessionAuthentication +from tastypie.authorization import Authorization, DjangoAuthorization +from tastypie.exceptions import NotFound +from tastypie.resources import Resource from tastypie.serializers import Serializer +from errors import cassie from errortracker import config, launchpad from ..metrics import measure_view @@ -313,7 +304,7 @@ def obj_get(self, **kwargs): oopses_by_day = set() oopses_by_release = set() for oops in cassie.get_oopses_by_day(date, limit): - oopses_by_day.add(str(oops)) + oopses_by_day.add(oops) oopses = oopses_by_day if release: @@ -614,10 +605,6 @@ def __getslice__(klass, start, finish): # TODO: use a cassandra function that does a multiget of the # crashes for crash, ts in crashes: - # cassandra records time in microseconds, convert to - # seconds - ts = (ts["submitted"][1]) * 1e-6 - ts = datetime.datetime.utcfromtimestamp(ts) d = cassie.get_crash(str(crash), columns=cols) program = split_package_and_version(d.get("Package", ""))[0] if not program: @@ -807,7 +794,7 @@ def __getslice__(klass, start, finish): if item[0] in results: results[item[0]] -= item[1] results = sorted( - list(results.items()), key=cmp_to_key(lambda x, y: cmp(x[0], y[0])) + list(results.items()), key=cmp_to_key(lambda x, y: (x[0] > y[0]) - (x[0] < y[0])) ) res = [{"x": result[0] * 1000, "y": result[1]} for result in results] diff --git a/src/errors/auth.py index 3fbd63a..9b4bf70 
100644 --- a/src/errors/auth.py +++ b/src/errors/auth.py @@ -15,5 +15,5 @@ def can_see_stacktraces(func): def in_groups(u): return u.groups.filter(name__in=groups).count() > 0 - l = "/login-failed" - return login_required(user_passes_test(in_groups, login_url=l)(func)) + + return login_required(user_passes_test(in_groups, login_url="/login-failed")(func)) diff --git a/src/errors/cassie.py b/src/errors/cassie.py index fccd9c0..06a3493 100644 --- a/src/errors/cassie.py +++ b/src/errors/cassie.py @@ -1,21 +1,39 @@ import datetime -import operator -import sys import time -import urllib.error import urllib.parse -import urllib.request -from functools import cmp_to_key +from uuid import UUID +import distro_info import numpy +from cassandra.util import datetime_from_uuid1 -# TODO: port that to the cassandra module -# import pycassa -# from pycassa.cassandra.ttypes import NotFoundException -# from pycassa.util import OrderedDict from errortracker import cassandra, config - -session = cassandra.cassandra_session() +from errortracker.cassandra_schema import ( + OOPS, + Bucket, + BucketMetadata, + BucketRetraceFailureReason, + BucketVersionsCount, + BucketVersionSystems2, + BugToCrashSignatures, + Counters, + CountersForProposed, + DayBucketsCount, + DayOOPS, + DoesNotExist, + ErrorsByRelease, + Hashes, + Indexes, + RetraceStats, + SourceVersionBuckets, + Stacktrace, + SystemImages, + UniqueUsers90Days, + UserBinaryPackages, + UserOOPS, +) + +session = cassandra.cassandra_session def _split_into_dictionaries(original): @@ -27,42 +45,38 @@ def _split_into_dictionaries(original): return value -def _get_range_of_dates(start, finish): +def _get_range_of_dates(start_x_days_ago: int, finish_x_days_ago: int) -> list[str]: """Get a range of dates from start to finish. This is necessary because we use the Cassandra random partitioner, so lexicographical ranges are not possible.""" - finish = finish - start - date = datetime.datetime.utcnow() - datetime.timedelta(days=start) + finish_x_days_ago = finish_x_days_ago - start_x_days_ago + date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=start_x_days_ago) delta = datetime.timedelta(days=1) dates = [] - for i in range(finish): + for i in range(finish_x_days_ago): dates.append(date.strftime("%Y%m%d")) date = date - delta return dates -def get_oopses_by_day(date, limit=1000): +def get_oopses_by_day(date: str, limit: int = 1000): """All of the OOPSes in the given day.""" - oopses_by_day = session.prepare('SELECT value FROM crashdb."DayOOPS" WHERE key = ? LIMIT ?;') - for row in session.execute(oopses_by_day, [date, limit]): - yield row.value + for row in DayOOPS.objects.filter(key=date.encode()).limit(limit): + yield row.column1 -def get_oopses_by_release(release, limit=1000): +def get_oopses_by_release(release: str, limit: int = 1000): """All of the OOPSes in the given release.""" - oopses_by_release = session.prepare( - 'SELECT column1 FROM crashdb."ErrorsByRelease" WHERE key = ? LIMIT ? 
ALLOW FILTERING;' - ) - for row in session.execute(oopses_by_release, [release.encode(), limit]): + for row in ErrorsByRelease.objects.filter(key=release).limit(limit): yield row.column1 -def get_total_buckets_by_day(start, finish): +def get_total_buckets_by_day(start: int, finish: int): """All of the buckets added to for the past seven days.""" - daybucketscount_cf = pycassa.ColumnFamily(pool, "DayBucketsCount") dates = _get_range_of_dates(start, finish) for date in dates: - yield (date, daybucketscount_cf.get_count(date)) + count = DayBucketsCount.objects.filter(key=date.encode()).count() + yield (date, count) def _date_range_iterator(start, finish): @@ -93,7 +107,6 @@ def get_bucket_counts( """The number of times each bucket has been added to today, this month, or this year.""" - daybucketscount_cf = pycassa.ColumnFamily(pool, "DayBucketsCount") periods = "" if period: if period == "today" or period == "day": @@ -150,84 +163,89 @@ def get_bucket_counts( keys.append(key) results = {} - batch_size = 500 for key in keys: - start = "" - while True: - try: - result = daybucketscount_cf.get(key, column_start=start, column_count=batch_size) - except NotFoundException: - break - - for column, count in result.items(): + try: + rows = DayBucketsCount.objects.filter(key=key.encode()).all() + for row in rows: + column = row.column1 + count = row.value if not show_failed and column.startswith("failed"): continue - column = column.encode("utf-8") + if isinstance(column, str): + column = column.encode("utf-8") try: existing = results[column] except KeyError: existing = 0 results[column] = count + existing - # We do not want to include the end of the previous batch. - start = column + "0" - if len(result) < batch_size: - break - return sorted( - list(results.items()), key=cmp_to_key(lambda x, y: cmp(x[1], y[1])), reverse=True - ) + except DoesNotExist: + continue + return sorted(list(results.items()), key=lambda x: x[1], reverse=True) -def get_crashes_for_bucket(bucketid, limit=100, start=None): + +def get_crashes_for_bucket(bucketid: str, limit: int = 100, start: str = None) -> list[UUID]: """ Get limit crashes for the provided bucket, starting at start. We show the most recent crashes first, since they'll be the most relevant to the current state of the problem. """ - bucket_cf = pycassa.ColumnFamily(pool, "Bucket") try: + query = Bucket.objects.filter(key=bucketid).order_by("-column1") if start: - start = pycassa.util.uuid.UUID(start) - return list( - bucket_cf.get( - bucketid, column_start=start, column_count=limit, column_reversed=True - ).keys() - )[1:] - else: - return list(bucket_cf.get(bucketid, column_count=limit, column_reversed=True).keys()) - except NotFoundException: + start_uuid = UUID(start) + # Get items less than start (because of reversed ordering) + query = query.filter(column1__lt=start_uuid) + + return [row.column1 for row in list(query.limit(limit).all())] + except DoesNotExist: return [] def get_package_for_bucket(bucketid): """Returns the package and version for a given bucket.""" - bucket_cf = pycassa.ColumnFamily(pool, "Bucket") - oops_cf = pycassa.ColumnFamily(pool, "OOPS") - # Grab 5 OOPS IDs, just in case the first one doesn't have a Package field. + # Grab 50 OOPS IDs, just in case the first one doesn't have a Package field. 
try: - oopsids = list(bucket_cf.get(bucketid, column_count=5).keys()) - except NotFoundException: + rows = Bucket.objects.filter(key=bucketid).limit(50).all() + oopsids = [row.column1 for row in rows] + except DoesNotExist: return ("", "") + for oopsid in oopsids: try: - oops = oops_cf.get(str(oopsid), columns=["Package"]) - package_and_version = oops["Package"].split()[:2] - if len(package_and_version) == 1: - return (package_and_version[0], "") - else: - return package_and_version - except (KeyError, NotFoundException): + oops_rows = OOPS.objects.filter(key=str(oopsid).encode(), column1="Package").all() + for row in oops_rows: + value = row.value + if isinstance(value, bytes): + value = value.decode("utf-8") + package_and_version = value.split()[:2] + if len(package_and_version) == 1: + return (package_and_version[0], "") + else: + return tuple(package_and_version) + except (KeyError, DoesNotExist): continue return ("", "") def get_crash(oopsid, columns=None): - oops_cf = pycassa.ColumnFamily(pool, "OOPS") try: - oops = oops_cf.get(oopsid, columns=columns) - except NotFoundException: + query = OOPS.objects.filter(key=oopsid.encode()) + if columns: + # Filter by specific columns + query = query.filter(column1__in=columns) + + oops = {} + for row in query.all(): + oops[row.column1] = row.value + + if not oops: + return {} + except DoesNotExist: return {} + if "StacktraceAddressSignature" in oops: SAS = oops["StacktraceAddressSignature"] if not SAS: @@ -239,117 +257,102 @@ def get_crash(oopsid, columns=None): return oops else: return oops + try: - indexes_cf = pycassa.ColumnFamily(pool, "Indexes") - idx = "crash_signature_for_stacktrace_address_signature" - bucket = indexes_cf.get(idx, [SAS]) - oops["SAS"] = bucket[SAS] + index_key = b"crash_signature_for_stacktrace_address_signature" + index_rows = Indexes.objects.filter(key=index_key, column1=SAS).all() + for row in index_rows: + oops["SAS"] = row.value.decode() if isinstance(row.value, bytes) else row.value + break return oops - except NotFoundException: + except DoesNotExist: return oops - return oops def get_traceback_for_bucket(bucketid): - oops_cf = pycassa.ColumnFamily(pool, "OOPS") # TODO fetching a crash ID twice, once here and once in get_stacktrace, is # a bit rubbish, but we'll write the stacktrace into the bucket at some # point and get rid of the contents of both of these functions. - if len(get_crashes_for_bucket(bucketid, 1)) == 0: + crashes = get_crashes_for_bucket(bucketid, 1) + if len(crashes) == 0: return None - crash = str(get_crashes_for_bucket(bucketid, 1)[0]) + crash = str(crashes[0]) try: - return oops_cf.get(crash, columns=["Traceback"])["Traceback"] - except NotFoundException: + rows = OOPS.objects.filter(key=crash.encode(), column1="Traceback").all() + for row in rows: + return row.value + return None + except DoesNotExist: return None -def get_stacktrace_for_bucket(bucketid): - stacktrace_cf = pycassa.ColumnFamily(pool, "Stacktrace") - oops_cf = pycassa.ColumnFamily(pool, "OOPS") +def get_stacktrace_for_bucket(bucketid: str): # TODO: we should build some sort of index for this. 
SAS = "StacktraceAddressSignature" cols = ["Stacktrace", "ThreadStacktrace"] for crash in get_crashes_for_bucket(bucketid, 10): sas = None try: - sas = oops_cf.get(str(crash), columns=[SAS])[SAS] - except NotFoundException: + rows = OOPS.objects.filter(key=str(crash).encode(), column1=SAS).all() + for row in rows: + sas = row.value + break + except DoesNotExist: pass if not sas: continue try: - traces = stacktrace_cf.get(sas, columns=cols) + traces = {} + for col in cols: + trace_rows = Stacktrace.objects.filter(key=sas.encode(), column1=col).all() + for row in trace_rows: + traces[col] = row.value return (traces.get("Stacktrace", None), traces.get("ThreadStacktrace", None)) - except NotFoundException: + except DoesNotExist: pass - # We didn't have a stack trace for any of the signatures in this set of - # crashes. - # TODO in the future, we should go to the next 10 crashes. - # fixing this would make a stacktrace appear for - # https://errors.ubuntu.com/problem/24c9ba23fb469a953e7624b1dfb8fdae97c45618 return (None, None) -def get_retracer_count(date): - retracestats_cf = pycassa.ColumnFamily(pool, "RetraceStats") - result = retracestats_cf.get(date) - return _split_into_dictionaries(result) +def get_retracer_count(date: str): + try: + result = RetraceStats.get_as_dict(key=date.encode()) + return _split_into_dictionaries(result) + except DoesNotExist: + return {} def get_retracer_counts(start, finish): - retracestats_cf = pycassa.ColumnFamily(pool, "RetraceStats") - if finish == sys.maxsize: - start = datetime.date.today() - datetime.timedelta(days=start) - start = start.strftime("%Y%m%d") - results = retracestats_cf.get_range() - return ( - (date, _split_into_dictionaries(result)) for date, result in results if date < start - ) - else: - dates = _get_range_of_dates(start, finish) - results = retracestats_cf.multiget(dates) - return ((date, _split_into_dictionaries(results[date])) for date in results) + dates = _get_range_of_dates(start, finish) + results = {} + for date in dates: + try: + result = RetraceStats.get_as_dict(key=date.encode()) + results[date] = result + except DoesNotExist: + pass + return ((date, _split_into_dictionaries(results[date])) for date in results) def get_retracer_means(start, finish): - indexes_cf = pycassa.ColumnFamily(pool, "Indexes") - start = datetime.date.today() - datetime.timedelta(days=start) - start = start.strftime("%Y%m%d") - finish = datetime.date.today() - datetime.timedelta(days=finish) - finish = finish.strftime("%Y%m%d") - - # FIXME: We shouldn't be specifying a maximum number of columns - timings = indexes_cf.get( - "mean_retracing_time", - column_start=start, - column_finish=finish, - column_count=1000, - column_reversed=True, - ) - to_float = pycassa.marshal.unpacker_for("FloatType") - result = OrderedDict() - for timing in timings: - if not timing.endswith(":count"): - branch = result - parts = timing.split(":") - # If you go far enough back, you'll hit the point before we - # included the architecture in this CF, which will break here. - # This is because there's a day that has some retracers for all - # archs, and some for just i386. 
- if len(parts) < 3: - parts.append("all") - end = parts[-1] - for part in parts: - if part is end: - branch[part] = to_float(timings[timing]) - else: - branch = branch.setdefault(part, {}) - return iter(result.items()) + dates = _get_range_of_dates(start, finish) + results = list() + for date in dates: + result = {} + for release in distro_info.UbuntuDistroInfo().supported(result="object"): + release = "Ubuntu " + release.version.replace(" LTS", "") + result[release] = {} + for arch in ["amd64", "arm64", "armhf", "i386"]: + try: + key = f"{date}:{release}:{arch}" + timings = Indexes.get_as_dict(key=b"mean_retracing_time", column1=key) + result[release][arch] = timings[key] + except (DoesNotExist, IndexError): + pass + results.append((date, result)) + return results def get_crash_count(start, finish, release=None): - counters_cf = pycassa.ColumnFamily(pool, "Counters") dates = _get_range_of_dates(start, finish) for date in dates: try: @@ -357,97 +360,87 @@ def get_crash_count(start, finish, release=None): key = "oopses:%s" % release else: key = "oopses" - oopses = int(counters_cf.get(key, columns=[date])[date]) - yield (date, oopses) - except NotFoundException: + rows = Counters.objects.filter(key=key.encode(), column1=date).all() + for row in rows: + oopses = int(row.value) + yield (date, oopses) + break + except DoesNotExist: pass -def get_metadata_for_bucket(bucketid, release=None): - bucketmetadata_cf = pycassa.ColumnFamily(pool, "BucketMetadata") +def get_metadata_for_bucket(bucketid: str, release: str = None): try: if not release: - return bucketmetadata_cf.get(bucketid, column_finish="~") + # Get all columns up to "~" (non-inclusive) + rows = BucketMetadata.objects.filter(key=bucketid.encode(), column1__lt="~").all() else: - ret = bucketmetadata_cf.get(bucketid) + rows = BucketMetadata.objects.filter(key=bucketid.encode()).all() + + ret = {} + for row in rows: + ret[row.column1] = row.value + + if release and ret: try: ret["FirstSeen"] = ret["~%s:FirstSeen" % release] + except KeyError: + pass + try: ret["LastSeen"] = ret["~%s:LastSeen" % release] except KeyError: pass - return ret - except NotFoundException: + return ret + except DoesNotExist: return {} -def chunks(l, n): - # http://stackoverflow.com/a/312464/190597 - """Yield successive n-sized chunks from l.""" - for i in range(0, len(l), n): - yield l[i : i + n] - - def get_metadata_for_buckets(bucketids, release=None): - bucketmetadata_cf = pycassa.ColumnFamily(pool, "BucketMetadata") - ret = OrderedDict() - for buckets in chunks(bucketids, 5): - if not release: - ret.update(bucketmetadata_cf.multiget(buckets, column_finish="~")) - else: - ret.update(bucketmetadata_cf.multiget(buckets)) - if release: - for bucket in ret: - bucket = ret[bucket] - try: - bucket["FirstSeen"] = bucket["~%s:FirstSeen" % release] - bucket["LastSeen"] = bucket["~%s:LastSeen" % release] - except KeyError: - # Rather than confuse developers with half release-specific - # data. Of course this will only apply for the current row, so - # it's possible subsequent rows will show release-specific - # data. 
- if "FirstSeen" in bucket: - del bucket["FirstSeen"] - if "LastSeen" in bucket: - del bucket["LastSeen"] + ret = dict() + for bucketid in bucketids: + ret[bucketid] = get_metadata_for_bucket(bucketid, release) return ret -def get_user_crashes(user_token, limit=50, start=None): - useroops_cf = pycassa.ColumnFamily(pool, "UserOOPS") +def get_user_crashes(user_token: str, limit: int = 50, start=None): results = {} try: + query = UserOOPS.objects.filter(key=user_token.encode()).limit(limit).order_by("-column1") + if start: - start = pycassa.util.uuid.UUID(start) - result = useroops_cf.get( - user_token, column_start=start, column_count=limit, include_timestamp=True - ) - else: - result = useroops_cf.get(user_token, column_count=limit, include_timestamp=True) - for r in result: - results[r] = {"submitted": result[r]} - start = list(result.keys())[-1] + "0" - except NotFoundException: + # Filter to get items lower than start (reverse order) + query = query.filter(column1__lt=start) + + for row in query: + # Since we don't have timestamp directly, we'll use the column1 to compute it + results[row.column1] = datetime_from_uuid1(UUID(row.column1)) + except DoesNotExist: return [] - return [ - (k[0], k[1]) - for k in sorted(iter(results.items()), key=operator.itemgetter(1), reverse=True) - ] + + return [(k, results[k]) for k in results.keys()] def get_average_crashes(field, release, days=7): - uniqueusers_cf = pycassa.ColumnFamily(pool, "UniqueUsers90Days") - counters_cf = pycassa.ColumnFamily(pool, "Counters") dates = _get_range_of_dates(0, days) start = dates[-1] end = dates[0] + try: key = "oopses:%s" % field - g = counters_cf.xget(key, column_start=start, column_finish=end) - oopses = pycassa.util.OrderedDict(x for x in g) - g = uniqueusers_cf.xget(release, column_start=start, column_finish=end) - users = pycassa.util.OrderedDict(x for x in g) - except NotFoundException: + oopses = dict() + oops_rows = Counters.objects.filter( + key=key.encode(), column1__gte=start, column1__lte=end + ).all() + for row in oops_rows: + oopses[row.column1] = row.value + + users = dict() + user_rows = UniqueUsers90Days.objects.filter( + key=release, column1__gte=start, column1__lte=end + ).all() + for row in user_rows: + users[row.column1] = row.value + except DoesNotExist: return [] return_data = [] @@ -462,8 +455,6 @@ def get_average_crashes(field, release, days=7): def get_average_instances(bucketid, release, days=7): - uniqueusers_cf = pycassa.ColumnFamily(pool, "UniqueUsers90Days") - daybucketscount_cf = pycassa.ColumnFamily(pool, "DayBucketsCount") # FIXME Why oh why did we do things this way around? It makes it impossible # to do a quick range scan. 
We should create DayBucketsCount2, replacing # this with a CF that's keyed on the bucket ID and has counter columns @@ -471,12 +462,23 @@ dates = _get_range_of_dates(0, days) start = dates[-1] end = dates[0] - gen = uniqueusers_cf.xget(release, column_start=start, column_finish=end) - users = dict(x for x in gen) + + user_rows = UniqueUsers90Days.objects.filter( + key=release, column1__gte=start, column1__lte=end + ).all() + users = {row.column1: row.value for row in user_rows} + for date in dates: try: - count = daybucketscount_cf.get("%s:%s" % (release, date), columns=[bucketid])[bucketid] - except NotFoundException: + key = "%s:%s" % (release, date) + count_rows = DayBucketsCount.objects.filter(key=key.encode(), column1=bucketid).all() + count = None + for row in count_rows: + count = row.value + break + if count is None: + continue + except DoesNotExist: continue try: avg = float(count) / float(users[date]) @@ -486,59 +488,75 @@ yield ((t, avg)) -def get_versions_for_bucket(bucketid): +def get_versions_for_bucket(bucketid: str): """Get the dictionary of (release, version) tuples for the given bucket with values of their instance counts. If the bucket does not exist, return an empty dict.""" - bv_count_cf = pycassa.ColumnFamily(pool, "BucketVersionsCount") try: - return bv_count_cf.get(bucketid) - except NotFoundException: + rows = BucketVersionsCount.objects.filter(key=bucketid).all() + result = {} + for row in rows: + result[(row.column1, row.column2)] = row.value + return result + except DoesNotExist: return {} -def get_source_package_for_bucket(bucketid): - oops_cf = pycassa.ColumnFamily(pool, "OOPS") - bucket_cf = pycassa.ColumnFamily(pool, "Bucket") - oopsids = list(bucket_cf.get(bucketid, column_count=10).keys()) +def get_source_package_for_bucket(bucketid: str): + bucket_rows = Bucket.objects.filter(key=bucketid).limit(50).all() + oopsids = [row.column1 for row in bucket_rows] for oopsid in oopsids: try: - oops = oops_cf.get(str(oopsid), columns=["SourcePackage"]) - return oops["SourcePackage"] - except (KeyError, NotFoundException): + oops_rows = OOPS.objects.filter( + key=str(oopsid).encode(), column1="SourcePackage" + ).all() + for row in oops_rows: + return row.value + except (KeyError, DoesNotExist): continue return "" -def get_retrace_failure_for_bucket(bucketid): - bucketretracefail_fam = pycassa.ColumnFamily(pool, "BucketRetraceFailureReason") +def get_retrace_failure_for_bucket(bucketid: str): try: - failuredata = bucketretracefail_fam.get(bucketid) + failuredata = BucketRetraceFailureReason.get_as_dict(key=bucketid.encode()) return failuredata - except NotFoundException: + except DoesNotExist: return {} def get_binary_packages_for_user(user): # query DayBucketsCount to ensure the package has crashes reported about # it rather than returning packages for which there will be no data. - daybucketscount_cf = pycassa.ColumnFamily(pool, "DayBucketsCount") - userbinpkgs_cf = pycassa.ColumnFamily(pool, "UserBinaryPackages") # if a package's last crash was reported more than a month ago then it # won't be returned here, however the package isn't likely to appear in # the most-common-problems. 
- period = (datetime.date.today() - datetime.timedelta(30)).strftime("%Y%m") + last_month = (datetime.date.today() - datetime.timedelta(30)).strftime("%Y%m") + current_month = (datetime.date.today()).strftime("%Y%m") + binary_packages = [] try: - binary_packages = [pkg[0] + ":%s" % period for pkg in userbinpkgs_cf.xget(user)] - except NotFoundException: + pkg_rows = UserBinaryPackages.objects.filter(key=user).all() + binary_packages = [row.column1 for row in pkg_rows] + except DoesNotExist: return None if len(binary_packages) == 0: return None - results = daybucketscount_cf.multiget_count(binary_packages, max_count=1) - for result in results: - if results[result] == 0: - del results[result] - return [k[0:-7] for k in list(results.keys())] + + results = [] + for pkg in binary_packages: + count = ( + DayBucketsCount.objects.filter(key=(pkg + ":%s" % last_month).encode()) + .limit(1) + .count() + + DayBucketsCount.objects.filter(key=(pkg + ":%s" % current_month).encode()) + .limit(1) + .count() + ) + # only include packages that have recent crashes + if count > 0: + results.append(pkg) + + return results def get_package_crash_rate( @@ -546,50 +564,70 @@ def get_package_crash_rate( ): """Find the rate of Crashes, not other problems, about a package.""" - counters_cf = pycassa.ColumnFamily(pool, "Counters") - proposed_counters_cf = pycassa.ColumnFamily(pool, "CountersForProposed") # the generic counter only includes Crashes for packages from official # Ubuntu sources and from systems not under auto testing - old_vers_column = "%s:%s:%s" % (release, src_package, old_version) - new_vers_column = "%s:%s:%s" % (release, src_package, new_version) + old_vers_column = "oopses:Crash:%s:%s:%s" % (release, src_package, old_version) + new_vers_column = "oopses:Crash:%s:%s:%s" % (release, src_package, new_version) results = {} + try: - # The first thing done is the reversing of the order that's why it - # is column_start - old_vers_data = counters_cf.get( - old_vers_column, column_start=date, column_reversed=True, column_count=15 + old_rows = ( + Counters.objects.filter(key=old_vers_column.encode(), column1__lte=date) + .order_by("-column1") + .limit(15) + .all() ) - except NotFoundException: + old_vers_data = {row.column1: row.value for row in old_rows} + except DoesNotExist: old_vers_data = None + try: # this may be unnecessarily long since updates phase in ~3 days - new_vers_data = counters_cf.get(new_vers_column, column_reversed=True, column_count=15) - except NotFoundException: + new_rows = ( + Counters.objects.filter(key=new_vers_column.encode()) + .order_by("-column1") + .limit(15) + .all() + ) + new_vers_data = {row.column1: row.value for row in new_rows} + except DoesNotExist: results["increase"] = False return results + + if not new_vers_data: + results["increase"] = False + return results + if exclude_proposed: try: - # The first thing done is the reversing of the order that's why it - # is column_start - proposed_old_vers_data = proposed_counters_cf.get( - old_vers_column, column_start=date, column_reversed=True, column_count=15 + proposed_old_rows = ( + CountersForProposed.objects.filter(key=old_vers_column.encode(), column1__lte=date) + .order_by("-column1") + .limit(15) + .all() ) - except NotFoundException: + proposed_old_vers_data = {row.column1: row.value for row in proposed_old_rows} + except DoesNotExist: proposed_old_vers_data = None try: - # this may be unnecessarily long since updates phase in ~3 days - proposed_new_vers_data = proposed_counters_cf.get( - new_vers_column, 
column_reversed=True, column_count=15 + proposed_new_rows = ( + CountersForProposed.objects.filter(key=new_vers_column.encode()) + .order_by("-column1") + .limit(15) + .all() ) - except NotFoundException: + proposed_new_vers_data = {row.column1: row.value for row in proposed_new_rows} + except DoesNotExist: proposed_new_vers_data = None - today = datetime.datetime.utcnow().strftime("%Y%m%d") + + today = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d") try: today_crashes = new_vers_data[today] except KeyError: # no crashes today so not an increase results["increase"] = False return results + # subtract CountersForProposed data from today crashes if exclude_proposed and proposed_new_vers_data: try: @@ -601,6 +639,7 @@ def get_package_crash_rate( # no crashes today so not an increase results["increase"] = False return results + if new_vers_data and not old_vers_data: results["increase"] = True results["previous_average"] = None @@ -613,6 +652,7 @@ def get_package_crash_rate( ) results["web_link"] = absolute_uri + web_link return results + first_date = date oldest_date = list(old_vers_data.keys())[-1] dates = [x for x in _date_range_iterator(oldest_date, first_date)] @@ -633,11 +673,13 @@ def get_package_crash_rate( # the day doesn't exist so there were 0 errors except KeyError: previous_vers_crashes.append(0) + results["increase"] = False # 2 crashes may be a fluke if today_crashes < 3: return results - now = datetime.datetime.utcnow() + + now = datetime.datetime.now(datetime.timezone.utc) hour = float(now.hour) minute = float(now.minute) mean_crashes = numpy.average(previous_vers_crashes) @@ -668,33 +710,32 @@ def get_package_crash_rate( return results -def get_package_new_buckets(src_pkg, previous_version, new_version): - srcversionbuckets_cf = pycassa.ColumnFamily(pool, "SourceVersionBuckets") - bucketversionsystems_cf = pycassa.ColumnFamily(pool, "BucketVersionSystems2") +def get_package_new_buckets(src_pkg: str, previous_version: str, new_version: str): results = [] + # new version has no buckets try: - n_data = [bucket[0] for bucket in srcversionbuckets_cf.xget((src_pkg, new_version))] - except KeyError: + new_rows = SourceVersionBuckets.objects.filter(key=src_pkg, key2=new_version).all() + n_data = [row.column1 for row in new_rows] + except (KeyError, DoesNotExist): return results + # if previous version has no buckets return an empty list try: - p_data = [bucket[0] for bucket in srcversionbuckets_cf.xget((src_pkg, previous_version))] - except KeyError: + prev_rows = SourceVersionBuckets.objects.filter(key=src_pkg, key2=previous_version).all() + p_data = [row.column1 for row in prev_rows] + except (KeyError, DoesNotExist): p_data = [] new_buckets = set(n_data).difference(set(p_data)) for bucket in new_buckets: - if isinstance(bucket, str): - bucket = bucket.encode("utf-8") # do not return buckets that failed to retrace if bucket.startswith("failed:"): continue - if isinstance(new_version, str): - new_version = new_version.encode("utf-8") + try: - count = len(bucketversionsystems_cf.get((bucket, new_version), column_count=4)) - except NotFoundException: + count = BucketVersionSystems2.objects.filter(key=bucket, key2=new_version).count() + except DoesNotExist: continue if count <= 2: continue @@ -702,52 +743,47 @@ def get_package_new_buckets(src_pkg, previous_version, new_version): return results -def record_bug_for_bucket(bucketid, bug): - bucketmetadata_cf = pycassa.ColumnFamily(pool, "BucketMetadata") - bugtocrashsignatures_cf = pycassa.ColumnFamily(pool, 
"BugToCrashSignatures") +def record_bug_for_bucket(bucketid: str, bug: int): # We don't insert bugs into the database if we're using Launchpad staging, # as those will disappear in Launchpad but our copy would persist. - if config.lp_use_staging == "False": - bucketmetadata_cf.insert(bucketid, {"CreatedBug": bug}) - bugtocrashsignatures_cf.insert(int(bug), {bucketid: ""}) + if config.lp_use_staging: + return + BucketMetadata.create(key=bucketid.encode(), column1="CreatedBug", value=str(bug)) + BugToCrashSignatures.create(key=bug, column1=bucketid, value=b"") -def get_signatures_for_bug(bug): +def get_signatures_for_bug(bug: int): try: - bug = int(bug) - except ValueError: - return [] - - bugtocrashsignatures_cf = pycassa.ColumnFamily(pool, "BugToCrashSignatures") - try: - gen = bugtocrashsignatures_cf.xget(bug) - crashes = [crash for crash, unused in gen] + rows = BugToCrashSignatures.objects.filter(key=bug).all() + crashes = [row.column1 for row in rows] return crashes - except NotFoundException: + except DoesNotExist: return [] def bucket_exists(bucketid): - bucket_cf = pycassa.ColumnFamily(pool, "Bucket") try: - bucket_cf.get(bucketid, column_count=1) - return True - except NotFoundException: + count = Bucket.objects.filter(key=bucketid).limit(1).count() + return count > 0 + except DoesNotExist: return False -def get_problem_for_hash(hashed): - hashes_cf = pycassa.ColumnFamily(pool, "Hashes") +def get_problem_for_hash(hashed: str): try: - return hashes_cf.get("bucket_%s" % hashed[0], columns=[hashed])[hashed] - except NotFoundException: + key = ("bucket_%s" % hashed[0]).encode() + rows = Hashes.objects.filter(key=key, column1=hashed.encode()).all() + for row in rows: + return row.value + return None + except DoesNotExist: return None -def get_system_image_versions(image_type): - images_cf = pycassa.ColumnFamily(pool, "SystemImages") +def get_system_image_versions(image_type: str): try: - versions = [version[0] for version in images_cf.xget(image_type)] - return versions - except NotFoundException: + rows = SystemImages.objects.filter(key=image_type).limit(None).all() + versions = set([row.column1 for row in rows]) + return list(versions) + except DoesNotExist: return None diff --git a/src/errors/manage.py b/src/errors/manage.py index fe2b915..8e6c765 100755 --- a/src/errors/manage.py +++ b/src/errors/manage.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # from django.core.management import execute_from_command_line # import imp # try: diff --git a/src/errors/settings.py b/src/errors/settings.py index c7a93c7..3a8aa4d 100644 --- a/src/errors/settings.py +++ b/src/errors/settings.py @@ -1,7 +1,7 @@ # Django settings for errors project. import os -from errortracker import config, cassandra +from errortracker import cassandra, config cassandra.setup_cassandra() diff --git a/src/errors/status.py b/src/errors/status.py index 0338b09..86813dc 100644 --- a/src/errors/status.py +++ b/src/errors/status.py @@ -104,8 +104,7 @@ def check_most_common_problems(): url = "/api/1.0/most-common-problems/?limit=100&format=json" response = c.get(url) data = loads(response.content) - l = len(data["objects"]) - if l == 100: + if len(data["objects"]) == 100: obj = data["objects"][0] if "count" in obj and "function" in obj: return True @@ -115,9 +114,8 @@ def check_most_common_problems(): def check_oops_reports(): today = datetime.date.today().strftime("%Y-%m-%d") try: - l = os.listdir(os.path.join(config.oops_repository, today)) # If we get more than 25 oops reports, alert. 
- if len(l) > 25: + if len(os.listdir(os.path.join(config.oops_repository, today))) > 25: return False else: return True diff --git a/src/errors/urls.py index 28b28dd..cfc327e 100644 --- a/src/errors/urls.py +++ b/src/errors/urls.py @@ -1,8 +1,9 @@ from django.conf import settings from django.conf.urls import include -from django.urls import re_path from django.conf.urls.static import static +from django.urls import re_path from django.views.static import serve + from errors import views urlpatterns = [ diff --git a/src/errors/version.py index 5fa0a77..f74f8ff 100644 --- a/src/errors/version.py +++ b/src/errors/version.py @@ -1,5 +1,5 @@ version_info = {} try: - from .version_info import version_info -except: + from .version_info import version_info # noqa: F401 +except Exception: pass diff --git a/src/errors/version_middleware.py index d02bea2..9689b04 100644 --- a/src/errors/version_middleware.py +++ b/src/errors/version_middleware.py @@ -18,6 +18,7 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. from daisy.version import version_info as daisy_version_info + from errors.version import version_info as errors_version_info diff --git a/src/errors/views.py index 91e559a..938efd5 100644 --- a/src/errors/views.py +++ b/src/errors/views.py @@ -152,7 +152,7 @@ def status(request): def bug(request, bug): try: bug = int(bug) - except: + except Exception: return HttpResponseRedirect("/") signatures = cassie.get_signatures_for_bug(bug) diff --git a/src/errors/wsgi.py index 7106ea0..83b85e4 100644 --- a/src/errors/wsgi.py +++ b/src/errors/wsgi.py @@ -1,15 +1,15 @@ import os import oops_dictconfig -from errors import metrics -from errors.version_middleware import VersionMiddleware from oops_wsgi import install_hooks, make_app from oops_wsgi.django import OOPSWSGIHandler from daisy import config +from errors import metrics +from errors.version_middleware import VersionMiddleware os.environ.setdefault("DJANGO_SETTINGS_MODULE", "errors.settings") -import django.core.handlers.wsgi +import django from django.template.loader import render_to_string @@ -17,8 +17,6 @@ def error_renderer(report): return str(render_to_string("500.html", report)) -import django - django.setup() cfg = oops_dictconfig.config_from_dict(config.oops_config) diff --git a/src/errortracker/cassandra_schema.py index 7061f2a..d28d46e 100644 --- a/src/errortracker/cassandra_schema.py +++ b/src/errortracker/cassandra_schema.py @@ -12,8 +12,15 @@ class ErrorTrackerTable(models.Model): class Counters(ErrorTrackerTable): __table_name__ = "Counters" + # the index we count + # - Ubuntu 24.04:zsh:5.9-6ubuntu2 + # - Ubuntu 24.04:zsh key = columns.Blob(db_field="key", primary_key=True) + # a datestamp + # - 20251101 + # - 20240612 column1 = columns.Text(db_field="column1", primary_key=True) + # the count of crashes for that release:package[:version] that day value = columns.Counter(db_field="value") @@ -30,8 +37,9 @@ class Indexes(ErrorTrackerTable): column1 = columns.Text(db_field="column1", primary_key=True) value = columns.Blob(db_field="value") - def get_as_dict(*args, **kwargs) -> dict: - query = Indexes.objects.filter(*args, **kwargs) + @classmethod + def get_as_dict(cls, *args, **kwargs) -> dict: + query = cls.objects.filter(*args, **kwargs) d = {} for result in query: if result.key == b"mean_retracing_time" and not result.column1.endswith("count"): @@ -41,7 +49,7 @@ def get_as_dict(*args, **kwargs) -> dict: else: d[result.column1] 
= result.value if not d: - raise Indexes.DoesNotExist + raise cls.DoesNotExist return d @@ -54,9 +62,15 @@ class CouldNotBucket(ErrorTrackerTable): class DayOOPS(ErrorTrackerTable): __table_name__ = "DayOOPS" + # a day + # - b'20160809' + # - b'20260116' key = columns.Blob(db_field="key", primary_key=True) + # an OOPS that appeared that day column1 = columns.TimeUUID(db_field="column1", primary_key=True) + # an OOPS that appeared that day value = columns.Blob(db_field="value") + # yes, both column1 and value are the same, just the format is changing class DayUsers(ErrorTrackerTable): @@ -68,8 +82,13 @@ class DayUsers(ErrorTrackerTable): class UserOOPS(ErrorTrackerTable): __table_name__ = "UserOOPS" + # the user ID, aka machine-id + # - b'' key = columns.Blob(db_field="key", primary_key=True) + # an OOPS reported by that machine + # - column1 = columns.Text(db_field="column1", primary_key=True) + # appears to be unused value = columns.Blob(db_field="value") @@ -79,11 +98,12 @@ class OOPS(ErrorTrackerTable): column1 = columns.Text(db_field="column1", primary_key=True) value = columns.Text(db_field="value") - def get_as_dict(*args, **kwargs) -> dict: - query = OOPS.objects.filter(*args, **kwargs) + @classmethod + def get_as_dict(cls, *args, **kwargs) -> dict: + query = cls.objects.filter(*args, **kwargs) d = {} for result in query: - d[result["column1"]] = result["value"] + d[result.column1] = result.value return d @@ -103,15 +123,28 @@ class SystemOOPSHashes(ErrorTrackerTable): class BucketMetadata(ErrorTrackerTable): __table_name__ = "BucketMetadata" + # the bucket ID + # - b'/bin/zsh:11:makezleparams:execzlefunc:redrawhook:zlecore:zleread' key = columns.Blob(db_field="key", primary_key=True) + # Which metadata + # - FirstSeen (package version) + # - LastSeen (package version) + # - FirstSeenRelease (Ubuntu series) + # - ~Ubuntu 25.04:LastSeen (package version) + # - CreatedBug column1 = columns.Text(db_field="column1", primary_key=True) + # The corresponding value for the metadata + # - 5.9-6ubuntu2 (package version) + # - Ubuntu 18.04 (Ubuntu series) + # - value = columns.Text(db_field="value") - def get_as_dict(*args, **kwargs) -> dict: - query = BucketMetadata.objects.filter(*args, **kwargs) + @classmethod + def get_as_dict(cls, *args, **kwargs) -> dict: + query = cls.objects.filter(*args, **kwargs) d = {} for result in query: - d[result["column1"]] = result["value"] + d[result.column1] = result.value return d @@ -128,11 +161,12 @@ class RetraceStats(ErrorTrackerTable): column1 = columns.Text(db_field="column1", primary_key=True) value = columns.Counter(db_field="value") - def get_as_dict(*args, **kwargs) -> dict: - query = RetraceStats.objects.filter(*args, **kwargs) + @classmethod + def get_as_dict(cls, *args, **kwargs) -> dict: + query = cls.objects.filter(*args, **kwargs) d = {} for result in query: - d[result["column1"]] = result["value"] + d[result.column1] = result.value return d @@ -145,16 +179,34 @@ class Bucket(ErrorTrackerTable): class DayBuckets(ErrorTrackerTable): __table_name__ = "DayBuckets" + # a day + # - 20160809 + # - 20260116 key = columns.Text(db_field="key", primary_key=True) + # the bucketid: + # - /bin/zsh:11:__GI__IO_flush_all:_IO_cleanup:__run_exit_handlers:__GI_exit:zexit + # - /bin/brltty:*** buffer overflow detected ***: terminated key2 = columns.Text(db_field="key2", primary_key=True) + # an OOPS id: + # - column1 = columns.Text(db_field="column1", primary_key=True) value = columns.Blob(db_field="value") class DayBucketsCount(ErrorTrackerTable): 
__table_name__ = "DayBucketsCount" + # the index we count + # - 20251201 + # - Ubuntu 24.04:20251201 + # - zsh:amd64:20251201 + # - Crash:zsh:amd64:20251201 (No idea about the difference with the previous example) + # - package:tvtime:(not installed)\nSetting up tvtime (1.0.11-8build2) ...\ndpkg: error processing package tvtime (--configure):\n installed tvtime package post-installation script subprocess returned error exit status 1\n key = columns.Blob(db_field="key", primary_key=True) + # The bucketid we count: + # - /bin/zsh:11:__GI__IO_flush_all:_IO_cleanup:__run_exit_handlers:__GI_exit:zexit + # - /bin/brltty:*** buffer overflow detected ***: terminated column1 = columns.Text(db_field="column1", primary_key=True) + # the counter itself value = columns.Counter(db_field="value") @@ -180,11 +232,12 @@ class BucketRetraceFailureReason(ErrorTrackerTable): column1 = columns.Text(db_field="column1", primary_key=True) value = columns.Text(db_field="value") - def get_as_dict(*args, **kwargs) -> dict: - query = BucketRetraceFailureReason.objects.filter(*args, **kwargs) + @classmethod + def get_as_dict(cls, *args, **kwargs) -> dict: + query = cls.objects.filter(*args, **kwargs) d = {} for result in query: - d[result["column1"]] = result["value"] + d[result.column1] = result.value return d @@ -193,3 +246,78 @@ class AwaitingRetrace(ErrorTrackerTable): key = columns.Text(db_field="key", primary_key=True) column1 = columns.Text(db_field="column1", primary_key=True) value = columns.Text(db_field="value") + + +class ErrorsByRelease(ErrorTrackerTable): + __table_name__ = "ErrorsByRelease" + # The release: + # - Ubuntu 25.04 + key = columns.Ascii(db_field="key", primary_key=True) + # The datetime when we received the OOPS + key2 = columns.DateTime(db_field="key2", primary_key=True) + # The OOPS id + column1 = columns.TimeUUID(db_field="column1", primary_key=True) + # The datetime when we received the OOPS (again???) + value = columns.DateTime(db_field="value") + + +class BucketVersionsCount(ErrorTrackerTable): + __table_name__ = "BucketVersionsCount" + key = columns.Text(db_field="key", primary_key=True) + column1 = columns.Ascii(db_field="column1", primary_key=True) + column2 = columns.Ascii(db_field="column2", primary_key=True) + value = columns.Counter(db_field="value") + + +class BugToCrashSignatures(ErrorTrackerTable): + __table_name__ = "BugToCrashSignatures" + # The bug number + key = columns.VarInt(db_field="key", primary_key=True) + # The crash signature: + # - /usr/lib/gnome-do/Do.exe:8:g_hash_table_lookup:mono_find_jit_icall_by_addr:mono_emit_jit_icall:mono_method_to_ir:mini_method_compile + column1 = columns.Text(db_field="column1", primary_key=True) + # appears to be unused + value = columns.Blob(db_field="value") + + +class SystemImages(ErrorTrackerTable): + # Very likely useless nowadays, doesn't have much up to date data + __table_name__ = "SystemImages" + # One of those: + # - device_image + # - rootfs_build + # - channel + # - device_name + key = columns.Text(db_field="key", primary_key=True) + # The version of the image type: + # - 16.04/community/walid/devel 101 titan + # - ubuntu-touch/vivid-proposed-customized-here 99 mako + column1 = columns.Text(db_field="column1", primary_key=True) + # Looks empty and unused + value = columns.Blob(db_field="value") + + +class UniqueUsers90Days(ErrorTrackerTable): + __table_name__ = "UniqueUsers90Days" + # Ubuntu series ("Ubuntu 26.04", "Ubuntu 25.10", etc...) 
+ key = columns.Text(db_field="key", primary_key=True) + # a datestamp ("20251101", "20240612", etc...) + column1 = columns.Text(db_field="column1", primary_key=True) + # the count of unique users of that release that day + value = columns.BigInt(db_field="value") + + +class UserBinaryPackages(ErrorTrackerTable): + __table_name__ = "UserBinaryPackages" + # a team that usually owns packages (like for MIR) + # - debcrafters-packages + # - foundations-bugs + # - xubuntu-bugs + key = columns.Ascii(db_field="key", primary_key=True) + # package names + # - abiword + # - util-linux + # looks to be binary packages only, but not 100% certain + column1 = columns.Ascii(db_field="column1", primary_key=True) + # looks unused + value = columns.Blob(db_field="value") diff --git a/src/errortracker/oopses.py b/src/errortracker/oopses.py index 37880cf..1459def 100644 --- a/src/errortracker/oopses.py +++ b/src/errortracker/oopses.py @@ -7,15 +7,16 @@ """basic operations on oopses in the db.""" import json +import locale import re import time import uuid +from datetime import datetime from hashlib import md5, sha1 from cassandra.cqlengine.query import BatchQuery from errortracker import cassandra_schema -from errortracker.cassandra import cassandra_session DAY = 60 * 60 * 24 MONTH = DAY * 30 @@ -100,7 +101,15 @@ def _insert( :param ttl: boolean for setting the time to live for the column :return: The day which the oops was filed under. """ - day_key = time.strftime("%Y%m%d", time.gmtime()) + try: + # Make sure the datetime will get formatted "correctly" in that cursed time format: Mon May 5 14:46:10 2025 + locale.setlocale(locale.LC_ALL, "C.UTF-8") + # Try to get the actual day of that crash, otherwise fallback to today + crash_datetime = datetime.strptime(insert_dict["Date"], "%c") + day_key = crash_datetime.strftime("%Y%m%d") + except Exception: + crash_datetime = datetime.now() + day_key = datetime.strftime(datetime.now(), "%Y%m%d") now_uuid = uuid.uuid1() if ttl: @@ -117,6 +126,13 @@ def _insert( automated_testing = True cassandra_schema.DayOOPS.create(key=day_key.encode(), column1=now_uuid, value=oopsid.encode()) + if "DistroRelease" in insert_dict: + cassandra_schema.ErrorsByRelease.create( + key=insert_dict["DistroRelease"], + key2=datetime.now(), + column1=now_uuid, + value=crash_datetime, + ) # Systems running automated tests should not be included in the OOPS count. if not automated_testing: @@ -129,12 +145,12 @@ def _insert( cassandra_schema.Counters.filter( key=f"oopses:{field}".encode(), column1=day_key ).update(value=1) - if proposed_pkg: - for field in fields: - field = field.encode("ascii", errors="replace").decode() - cassandra_schema.CountersForProposed.filter( - key=f"oopses:{field}".encode(), column1=day_key - ).update(value=1) + if proposed_pkg: + for field in fields: + field = field.encode("ascii", errors="replace").decode() + cassandra_schema.CountersForProposed.filter( + key=f"oopses:{field}".encode(), column1=day_key + ).update(value=1) if user_token: cassandra_schema.UserOOPS.create(key=user_token.encode(), column1=oopsid, value=b"") @@ -170,20 +186,16 @@ def bucket(oopsid, bucketid, fields=None, proposed_fields=False): :return: The day which the bucket was filed under. """ - session = cassandra_session() - # Get the timestamp. try: - results = session.execute( - session.prepare( - f'SELECT WRITETIME (value) FROM {session.keyspace}."OOPS" WHERE key = ? 
LIMIT 1' - ), - [oopsid.encode()], - ) - timestamp = list(results)[0]["writetime(value)"] - day_key = time.strftime("%Y%m%d", time.gmtime(timestamp / 1000000)) - except IndexError: - # Eventual consistency. This OOPS probably occurred today. - day_key = time.strftime("%Y%m%d", time.gmtime()) + # Make sure the datetime will get formatted "correctly" in that cursed time format: Mon May 5 14:46:10 2025 + locale.setlocale(locale.LC_ALL, "C.UTF-8") + row = cassandra_schema.OOPS.objects.get(key=oopsid.encode(), column1="Date") + # Try to get the actual day of that crash, otherwise fallback to today + crash_datetime = datetime.strptime(row.value, "%c") + day_key = crash_datetime.strftime("%Y%m%d") + except Exception: + crash_datetime = datetime.now() + day_key = datetime.strftime(datetime.now(), "%Y%m%d") cassandra_schema.Bucket.create(key=bucketid, column1=uuid.UUID(oopsid), value=b"") cassandra_schema.DayBuckets.create(key=day_key, key2=bucketid, column1=oopsid, value=b"") @@ -211,6 +223,12 @@ def bucket(oopsid, bucketid, fields=None, proposed_fields=False): return day_key +def update_bucket_versions_count(crash_signature: str, release: str, version: str): + cassandra_schema.BucketVersionsCount( + key=crash_signature, column1=release, column2=version + ).update(value=1) + + def update_bucket_metadata(bucketid, source, version, comparator, release=""): # We only update the first and last seen version fields. We do not update # the current version field as talking to Launchpad is an expensive diff --git a/src/errortracker/utils.py b/src/errortracker/utils.py index 426fe92..987dc4a 100644 --- a/src/errortracker/utils.py +++ b/src/errortracker/utils.py @@ -158,6 +158,7 @@ def bucket(oops_id, crash_signature, report_dict): apt.apt_pkg.version_compare, release, ) + oopses.update_bucket_versions_count(crash_signature, release, version) oopses.update_source_version_buckets(src_package, version, crash_signature) diff --git a/src/retracer.py b/src/retracer.py index a55cd6a..40dd128 100755 --- a/src/retracer.py +++ b/src/retracer.py @@ -1147,7 +1147,7 @@ def requeue(self, msg, oops_id): ts = msg.properties.get("timestamp") # If we are still unable to find the OOPS after 8 days then # just process it as a failure. 
- today = datetime.datetime.now(datetime.UTC) + today = datetime.datetime.now(datetime.timezone.utc) target_date = today - datetime.timedelta(days=8) # if we don't know how old it is it must be ancient if not ts: diff --git a/src/tests/conftest.py b/src/tests/conftest.py index c4a198c..f9e2705 100644 --- a/src/tests/conftest.py +++ b/src/tests/conftest.py @@ -8,6 +8,7 @@ import shutil import tempfile +from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -16,9 +17,10 @@ import retracer as et_retracer from errortracker import cassandra +from tests.create_test_data import create_test_data -@pytest.fixture(scope="function") +@pytest.fixture(scope="class") def temporary_db(): cassandra.KEYSPACE = "tmp" cassandra.REPLICATION_FACTOR = 1 @@ -27,7 +29,7 @@ def temporary_db(): management.drop_keyspace(cassandra.KEYSPACE) -@pytest.fixture(scope="function") +@pytest.fixture(scope="class") def retracer(temporary_db): temp = Path(tempfile.mkdtemp()) config_dir = temp / "config" @@ -45,3 +47,14 @@ def retracer(temporary_db): architecture=architecture, ) shutil.rmtree(temp) + + +@pytest.fixture(scope="module") +def datetime_now(): + return datetime.now() + + +@pytest.fixture(scope="class") +def cassandra_data(datetime_now, temporary_db): + create_test_data(datetime_now) + yield diff --git a/src/tests/create_test_data.py b/src/tests/create_test_data.py new file mode 100644 index 0000000..724e898 --- /dev/null +++ b/src/tests/create_test_data.py @@ -0,0 +1,231 @@ +import locale +import logging +import uuid +from datetime import datetime, timedelta + +import bson +from apport import Report + +from daisy.submit import submit +from errortracker import cassandra_schema as schema +from errortracker import utils + + +def create_test_data(datetime_now=datetime.now()): + # disable daisy logger temporarily + daisy_logger = logging.getLogger("daisy") + daisy_logger_level = daisy_logger.level + daisy_logger.setLevel(51) # CRITICAL is 50, so let's go higher + + # Make sure the datetime will get formatted "correctly" in that cursed time format: Mon May 5 14:46:10 2025 + locale.setlocale(locale.LC_ALL, "C.UTF-8") + + def new_oops(days_ago, data, systemid="imatestsystem"): + crash_date = datetime_now - timedelta(days=days_ago) + oops_date = crash_date.strftime("%c") + data.update({"Date": oops_date}) + bson_data = bson.encode(data) + request = type( + "Request", + (object,), + dict(data=bson_data, headers={"X-Whoopsie-Version": "0.2.81ubuntu~fakefortesting"}), + ) + submit(request, systemid) + + # Get a wide screen, because here we'll want to have compact data, meaning long lines 🙃 + # fmt: off + + # increase-rate package version 1 + for i in [30, 20, 10, 5, 2]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "increase-rate 1", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/increase-rate", "StacktraceAddressSignature": "/usr/bin/increase-rate:42:/usr/bin/increase-rate+28"}) + + # increase-rate package version 2 + for i in [2, 2, 1, 1, 1, 0, 0, 0, 0]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "increase-rate 2", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/increase-rate", "StacktraceAddressSignature": "/usr/bin/increase-rate:42:/usr/bin/increase-rate+fa0"}) + + # increase-rate package version 2 in proposed, even more crashes! 
+ for i in [1, 0]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "increase-rate 2", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/increase-rate", "StacktraceAddressSignature": "/usr/bin/increase-rate:42:/usr/bin/increase-rate+fa0", "Tags": "package-from-proposed"}) + + # no-crashes-today package version 1 (old version with crashes) + for i in [30, 20, 10, 5, 2]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "no-crashes-today 1", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/no-crashes-today", "StacktraceAddressSignature": "/usr/bin/no-crashes-today:1:/usr/bin/no-crashes-today+10"}) + + # no-crashes-today package version 2 (no crashes today - last crash was yesterday) + for i in [5, 3, 1]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "no-crashes-today 2", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/no-crashes-today", "StacktraceAddressSignature": "/usr/bin/no-crashes-today:2:/usr/bin/no-crashes-today+20"}) + + # few-crashes package version 1 (old version with crashes) + for i in [30, 20, 10, 5, 2]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "few-crashes 1", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/few-crashes", "StacktraceAddressSignature": "/usr/bin/few-crashes:1:/usr/bin/few-crashes+10"}) + + # few-crashes package version 2 (only 2 crashes today - less than threshold of 3) + for i in [0, 0]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "few-crashes 2", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/few-crashes", "StacktraceAddressSignature": "/usr/bin/few-crashes:2:/usr/bin/few-crashes+20"}) + + # new-package (no old version - should always be increase=True) + for i in [0, 0, 0, 0, 0]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "new-package 1", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/new-package", "StacktraceAddressSignature": "/usr/bin/new-package:1:/usr/bin/new-package+10"}) + + # low-difference package version 1 (old version with consistent crashes) + for i in [30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "low-difference 1", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/low-difference", "StacktraceAddressSignature": "/usr/bin/low-difference:1:/usr/bin/low-difference+10"}) + + # low-difference package version 2 (similar crash rate to version 1, so difference should be low) + # Only 1 crash today which is less than the expected average + for i in [0]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "low-difference 2", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/low-difference", "StacktraceAddressSignature": "/usr/bin/low-difference:2:/usr/bin/low-difference+20"}) + + # all-proposed package version 1 + for i in [30, 20, 10]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "all-proposed 1", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": "/usr/bin/all-proposed", "StacktraceAddressSignature": "/usr/bin/all-proposed:1:/usr/bin/all-proposed+10"}) + + # all-proposed package version 2 (all crashes today are from proposed) + for i in [0, 0, 0, 0]: + new_oops(i, {"DistroRelease": "Ubuntu 24.04", "Package": "all-proposed 2", "ProblemType": "Crash", "Architecture": "amd64", "ExecutablePath": 
"/usr/bin/all-proposed", "StacktraceAddressSignature": "/usr/bin/all-proposed:2:/usr/bin/all-proposed+20", "Tags": "package-from-proposed"}) + # fmt: on + + # a retraced and bucketed report + report = Report() + report["DistroRelease"] = "Ubuntu 24.04" + report["Package"] = "already-bucketed 1.0" + report["SourcePackage"] = "already-bucketed-src" + report["ExecutablePath"] = "/usr/bin/already-bucketed" + report["Signal"] = "11" + report["StacktraceTop"] = "func1 () at already-bucketed.c:42\nmain () at already-bucketed.c:14" + report["StacktraceAddressSignature"] = ( + "/usr/bin/already-bucketed:42:/usr/bin/already-bucketed+28" + ) + report["Stacktrace"] = ( + "#0 0x40004000 in func1 () at ./already-bucketed.c:42\n" + "#1 0x40005000 in main () at ./already-bucketed.c:14\n" + ) + report["ThreadStacktrace"] = ( + ".\nThread 1 (Thread 0x42424242 (LWP 4000)):\n" + "#0 0x40004000 in func1 () at ./already-bucketed.c:42\n" + "#1 0x40005000 in main () at ./already-bucketed.c:14\n" + ) + utils.bucket(str(uuid.uuid1()), report.crash_signature(), report) + # emulate the retracer + schema.Indexes.objects.create( + key=b"crash_signature_for_stacktrace_address_signature", + column1=report["StacktraceAddressSignature"], + value=report.crash_signature().encode(), + ) + schema.Stacktrace.objects.create( + key=report["StacktraceAddressSignature"].encode(), + column1="Stacktrace", + value=report["Stacktrace"], + ) + schema.Stacktrace.objects.create( + key=report["StacktraceAddressSignature"].encode(), + column1="ThreadStacktrace", + value=report["ThreadStacktrace"], + ) + + # another similar crash + new_oops( + 0, + { + "DistroRelease": "Ubuntu 26.04", + "Architecture": "amd64", + "Package": "already-bucketed 2.0", + "SourcePackage": "already-bucketed-src", + "ProblemType": "Crash", + "ExecutablePath": "/usr/bin/already-bucketed", + "StacktraceAddressSignature": report["StacktraceAddressSignature"], + "StacktraceTop": report["StacktraceTop"], + "Signal": report["Signal"], + }, + ) + + # a failed retrace report + failed_report = Report() + failed_report["DistroRelease"] = "Ubuntu 24.04" + failed_report["Package"] = "failed-retrace 1.0" + failed_report["SourcePackage"] = "failed-retrace-src" + failed_report["ExecutablePath"] = "/usr/bin/failed-retrace" + failed_report["Signal"] = "11" + failed_report["StacktraceTop"] = "failed_func () at failed.c:10\nmain () at failed.c:5" + failed_report["StacktraceAddressSignature"] = ( + "/usr/bin/failed-retrace:11:/usr/bin/failed-retrace+100" + ) + utils.bucket(str(uuid.uuid1()), failed_report.crash_signature(), failed_report) + # emulate a failed retrace with failure reasons + schema.BucketRetraceFailureReason.objects.create( + key=failed_report.crash_signature().encode(), + column1="missing-debug-symbols", + value="Debug symbols not available for package failed-retrace", + ) + schema.BucketRetraceFailureReason.objects.create( + key=failed_report.crash_signature().encode(), + column1="retrace-error", + value="Failed to generate stacktrace", + ) + + # a Python crash + python_report = Report() + python_report["DistroRelease"] = "Ubuntu 24.04" + python_report["Package"] = "python3-traceback 1.0" + python_report["SourcePackage"] = "python-traceback" + python_report["ExecutablePath"] = "/usr/bin/pytraceback" + python_report["Traceback"] = ( + "Traceback (most recent call last):\n" + ' File "/usr/bin/pytraceback", line 42, in func1\n' + " raise Exception('Test error')\n" + "Exception: Test error" + ) + new_oops(30, python_report) + new_oops(8, python_report) + new_oops(0, 
python_report) + + # This new crash is definitely bad, happening everywhere! + python_report["DistroRelease"] = "Ubuntu 24.04" + python_report["Package"] = "python3-traceback 1.1" + python_report["Traceback"] = ( + "Traceback (most recent call last):\n" + ' File "/usr/bin/pytraceback", line 84, in func2\n' + " raise RuntimeError('A very different traceback')\n" + "RuntimeError: A very different traceback" + ) + new_oops(2, python_report, systemid="testsystem1") + new_oops(1, python_report, systemid="testsystem2") + new_oops(0, python_report, systemid="testsystem3") + + # Even newer crash, less bad this time + python_report["Package"] = "python3-traceback 1.2" + python_report["Traceback"] = ( + "Traceback (most recent call last):\n" + ' File "/usr/bin/pytraceback", line 94, in func3\n' + " raise MemoryError('No more memory available, too bad')\n" + "MemoryError: No more memory available, too bad" + ) + new_oops(1, python_report) + + schema.SystemImages.objects.create( + key="device_image", column1="ubuntu-touch/devel-proposed 227 hammerhead", value=b"" + ) + + schema.UserBinaryPackages.objects.create(key="foundations-bugs", column1="adduser") + schema.UserBinaryPackages.objects.create(key="foundations-bugs", column1="apt") + schema.UserBinaryPackages.objects.create(key="foundations-bugs", column1="util-linux") + schema.UserBinaryPackages.objects.create(key="xubuntu-bugs", column1="abiword") + schema.UserBinaryPackages.objects.create(key="daisy-pluckers", column1="failed-retrace") + schema.UserBinaryPackages.objects.create(key="daisy-pluckers", column1="already-bucketed") + schema.UserBinaryPackages.objects.create(key="daisy-pluckers", column1="never-crashed") + + # XXX Hack to populate UniqueUsers90Days + # keep the import here, to avoid a new cassandra setup with the wrong keyspace in the tests + from tools import unique_users_daily_update + + unique_users_daily_update.main() + + # re-enable daisy logger + daisy_logger.setLevel(daisy_logger_level) + + +if __name__ == "__main__": + from errortracker import cassandra + + cassandra.setup_cassandra() + create_test_data() diff --git a/src/tests/test_cassie.py b/src/tests/test_cassie.py new file mode 100644 index 0000000..34d4c12 --- /dev/null +++ b/src/tests/test_cassie.py @@ -0,0 +1,556 @@ +from datetime import datetime, timedelta +from uuid import UUID + +import distro_info +import numpy +from pytest import approx + +from errors import cassie + + +class TestCassie: + def test_get_package_crash_rate_increase_rate(self, datetime_now, cassandra_data): + now = datetime_now + + crash_rate = cassie.get_package_crash_rate( + "Ubuntu 24.04", + "increase-rate", + "1", + "2", + "70", + (now - timedelta(days=0)).strftime("%Y%m%d"), + "https://errors.internal/", + ) + assert crash_rate == approx( + { + "increase": True, + "difference": numpy.float64(4.3), + "web_link": "https://errors.internal/?release=Ubuntu%2024.04&package=increase-rate&version=2", + "previous_period_in_days": 30, + "previous_average": numpy.float64(0.7), + }, + rel=1e-1, # We don't want much precision, Cassandra is already messing up the values + ) + + crash_rate = cassie.get_package_crash_rate( + "Ubuntu 24.04", + "increase-rate", + "1", + "2", + "70", + (now - timedelta(days=0)).strftime("%Y%m%d"), + "https://errors.internal/", + True, + ) + assert crash_rate == approx( + { + "increase": True, + "difference": numpy.float64(3.4), + "web_link": "https://errors.internal/?release=Ubuntu%2024.04&package=increase-rate&version=2", + "previous_period_in_days": 30, + "previous_average": 
numpy.float64(0.7), + }, + rel=1e-1, # We don't want much precision, Cassandra is already messing up the values + ) + + def test_get_package_crash_rate_no_crashes_today(self, datetime_now, cassandra_data): + """Test case where new version has no crashes today - should return increase=False""" + now = datetime_now + + crash_rate = cassie.get_package_crash_rate( + "Ubuntu 24.04", + "no-crashes-today", + "1", + "2", + "100", + (now - timedelta(days=0)).strftime("%Y%m%d"), + "https://errors.internal/", + ) + assert crash_rate == {"increase": False} + + def test_get_package_crash_rate_few_crashes(self, datetime_now, cassandra_data): + """Test case where new version has only 2 crashes today (less than threshold of 3) - should return increase=False""" + now = datetime_now + + crash_rate = cassie.get_package_crash_rate( + "Ubuntu 24.04", + "few-crashes", + "1", + "2", + "100", + (now - timedelta(days=0)).strftime("%Y%m%d"), + "https://errors.internal/", + ) + assert crash_rate == {"increase": False} + + def test_get_package_crash_rate_new_package(self, datetime_now, cassandra_data): + """Test case where there's no old version data - should return increase=True with difference=today_crashes""" + now = datetime_now + + crash_rate = cassie.get_package_crash_rate( + "Ubuntu 24.04", + "new-package", + "0", # Old version that doesn't exist + "1", + "100", + (now - timedelta(days=0)).strftime("%Y%m%d"), + "https://errors.internal/", + ) + assert crash_rate == approx( + { + "increase": True, + "difference": 5, # Should equal the number of crashes today + "web_link": "https://errors.internal/?release=Ubuntu%2024.04&package=new-package&version=1", + "previous_average": None, + }, + rel=1e-1, + ) + + def test_get_package_crash_rate_low_difference(self, datetime_now, cassandra_data): + """Test case where crash rate is similar between versions (difference <= 1) - should return increase=False""" + now = datetime_now + + crash_rate = cassie.get_package_crash_rate( + "Ubuntu 24.04", + "low-difference", + "1", + "2", + "100", + (now - timedelta(days=0)).strftime("%Y%m%d"), + "https://errors.internal/", + ) + assert crash_rate == {"increase": False} + + def test_get_package_crash_rate_all_proposed(self, datetime_now, cassandra_data): + """Test case where all today's crashes are from proposed and we exclude proposed - should return increase=False""" + now = datetime_now + + crash_rate = cassie.get_package_crash_rate( + "Ubuntu 24.04", + "all-proposed", + "1", + "2", + "100", + (now - timedelta(days=0)).strftime("%Y%m%d"), + "https://errors.internal/", + exclude_proposed=True, + ) + assert crash_rate == {"increase": False} + + def test_bucket_exists_true(self, cassandra_data): + """Test bucket_exists returns True for existing bucket""" + assert cassie.bucket_exists("/usr/bin/already-bucketed:11:func1:main") is True + + def test_bucket_exists_false(self, cassandra_data): + """Test bucket_exists returns False for non-existing bucket""" + # Use a non-existent bucket ID + assert cassie.bucket_exists("nonexistent_bucket_12345") is False + + def test_get_crashes_for_bucket(self, cassandra_data): + """Test get_crashes_for_bucket returns list of crash UUIDs""" + # Use known bucket from test data + bucket_id = "/usr/bin/already-bucketed:11:func1:main" + crashes = cassie.get_crashes_for_bucket(bucket_id, limit=10) + assert isinstance(crashes, list) + # Should have two crashes from the test data + assert len(crashes) == 2 + + for crash in crashes: + assert isinstance(crash, UUID) + + def 
test_get_crashes_for_bucket_nonexistent(self, cassandra_data): + """Test get_crashes_for_bucket returns empty list for non-existent bucket""" + crashes = cassie.get_crashes_for_bucket("nonexistent_bucket_12345") + assert crashes == [] + + def test_get_metadata_for_bucket(self, cassandra_data): + """Test get_metadata_for_bucket returns metadata dictionary""" + bucket_id = "/usr/bin/already-bucketed:11:func1:main" + metadata = cassie.get_metadata_for_bucket(bucket_id) + assert isinstance(metadata, dict) + assert metadata["Source"] == "already-bucketed" + assert metadata["FirstSeen"] == "1.0" + assert metadata["LastSeen"] == "2.0" + assert metadata["FirstSeenRelease"] == "Ubuntu 24.04" + assert metadata["LastSeenRelease"] == "Ubuntu 26.04" + + def test_get_metadata_for_bucket_nonexistent(self, cassandra_data): + """Test get_metadata_for_bucket returns empty dict for non-existent bucket""" + metadata = cassie.get_metadata_for_bucket("nonexistent_bucket_12345") + assert metadata == {} + + def test_get_versions_for_bucket(self, cassandra_data): + """Test get_versions_for_bucket returns version counts dictionary""" + bucket_id = "/usr/bin/already-bucketed:11:func1:main" + versions = cassie.get_versions_for_bucket(bucket_id) + assert isinstance(versions, dict) + assert versions["Ubuntu 24.04"] == "1.0" + assert versions["Ubuntu 26.04"] == "2.0" + + def test_get_versions_for_bucket_nonexistent(self, cassandra_data): + """Test get_versions_for_bucket returns empty dict for non-existent bucket""" + versions = cassie.get_versions_for_bucket("nonexistent_bucket_12345") + assert versions == {} + + def test_record_bug_for_bucket_and_get_signatures(self, cassandra_data): + """Test record_bug_for_bucket records a bug and get_signatures_for_bug retrieves it""" + from unittest.mock import patch + + from errortracker import config + + bucket_id = "/usr/bin/test-bucket:42:func:main" + bug_number = 100123 + + # Temporarily disable staging mode to test the actual functionality + with patch.object(config, "lp_use_staging", False): + # Record a bug for a bucket + cassie.record_bug_for_bucket(bucket_id, bug_number) + + # Retrieve signatures for that bug + signatures = cassie.get_signatures_for_bug(bug_number) + assert isinstance(signatures, list) + assert signatures == [bucket_id] + + def test_get_signatures_for_bug_nonexistent(self, cassandra_data): + """Test get_signatures_for_bug returns empty list for non-existent bug""" + signatures = cassie.get_signatures_for_bug(888888) + assert signatures == [] + + def test_get_crash(self, cassandra_data): + """Test get_crash returns crash data dictionary""" + # Get a crash UUID from the test data + bucket_id = "/usr/bin/already-bucketed:11:func1:main" + crashes = cassie.get_crashes_for_bucket(bucket_id, limit=1) + crash_data = cassie.get_crash(str(crashes[0])) + assert isinstance(crash_data, dict) + assert crash_data["ExecutablePath"] == "/usr/bin/already-bucketed" + assert crash_data["SourcePackage"] == "already-bucketed-src" + + def test_get_crash_nonexistent(self, cassandra_data): + """Test get_crash returns empty dict for non-existent crash""" + crash_data = cassie.get_crash("not-a-uuid") + assert crash_data == {} + + def test_get_package_for_bucket(self, cassandra_data): + """Test get_package_for_bucket returns package name and version""" + bucket_id = "/usr/bin/already-bucketed:11:func1:main" + package, version = cassie.get_package_for_bucket(bucket_id) + assert package == "already-bucketed" + assert version == "2.0" + + def 
test_get_package_for_bucket_nonexistent(self, cassandra_data): + """Test get_package_for_bucket returns empty strings for non-existent bucket""" + package, version = cassie.get_package_for_bucket("nonexistent_bucket_12345") + assert package == "" + assert version == "" + + def test_get_problem_for_hash(self, cassandra_data): + """Test get_problem_for_hash returns problem signature for hash""" + # Test with a hash that exists + result = cassie.get_problem_for_hash("6f2c361a80d2e8afd62563539e9618569e387b48") + assert result == "/usr/bin/already-bucketed:11:func1:main" + + def test_get_problem_for_hash_nonexistent(self, cassandra_data): + """Test get_problem_for_hash returns None for non-existent hash""" + result = cassie.get_problem_for_hash("nonexistent_hash_xyz") + assert result is None + + def test_get_system_image_versions(self, cassandra_data): + """Test get_system_image_versions returns list of versions""" + # Test with a common image type + versions = cassie.get_system_image_versions("device_image") + assert versions == ["ubuntu-touch/devel-proposed 227 hammerhead"] + + def test_get_source_package_for_bucket(self, cassandra_data): + """Test get_source_package_for_bucket returns source package name""" + bucket_id = "/usr/bin/already-bucketed:11:func1:main" + source_package = cassie.get_source_package_for_bucket(bucket_id) + assert source_package == "already-bucketed-src" + + def test_get_source_package_for_bucket_nonexistent(self, cassandra_data): + """Test get_source_package_for_bucket returns empty string for non-existent bucket""" + source_package = cassie.get_source_package_for_bucket("nonexistent_bucket_12345") + assert source_package == "" + + def test_get_traceback_for_bucket(self, cassandra_data): + """Test get_traceback_for_bucket returns traceback data""" + bucket_id = "/usr/bin/pytraceback:Exception:func1" + traceback = cassie.get_traceback_for_bucket(bucket_id) + assert "Traceback (most recent call last)" in traceback + assert "/usr/bin/pytraceback" in traceback + assert "Test error" in traceback + + def test_get_traceback_for_bucket_nonexistent(self, cassandra_data): + """Test get_traceback_for_bucket returns None for non-existent bucket""" + traceback = cassie.get_traceback_for_bucket("nonexistent_bucket_12345") + assert traceback is None + + def test_get_stacktrace_for_bucket(self, cassandra_data): + """Test get_stacktrace_for_bucket returns stacktrace data""" + bucket_id = "/usr/bin/already-bucketed:11:func1:main" + result = cassie.get_stacktrace_for_bucket(bucket_id) + # Should return tuple of (Stacktrace, ThreadStacktrace) + assert result is not None + assert isinstance(result, tuple) + assert len(result) == 2 + stacktrace, thread_stacktrace = result + # Check specific values in stacktrace + assert "func1" in stacktrace + assert "main" in stacktrace + # Check specific values in thread_stacktrace + assert "Thread 1" in thread_stacktrace + assert "0x42424242" in thread_stacktrace + assert "func1 ()" in thread_stacktrace + assert "already-bucketed.c:42" in thread_stacktrace + + def test_get_stacktrace_for_bucket_nonexistent(self, cassandra_data): + """Test get_stacktrace_for_bucket returns (None, None) for non-existent bucket""" + result = cassie.get_stacktrace_for_bucket("nonexistent_bucket_12345") + assert result == (None, None) + + def test_get_retrace_failure_for_bucket(self, cassandra_data): + """Test get_retrace_failure_for_bucket returns failure data""" + bucket_id = "/usr/bin/failed-retrace:11:failed_func:main" + result = 
cassie.get_retrace_failure_for_bucket(bucket_id) + # Should return dict with failure reasons + assert isinstance(result, dict) + assert len(result) > 0 + assert "missing-debug-symbols" in result + assert "Debug symbols not available" in result["missing-debug-symbols"] + assert "retrace-error" in result + assert "Failed to generate stacktrace" in result["retrace-error"] + + def test_get_retrace_failure_for_bucket_nonexistent(self, cassandra_data): + """Test get_retrace_failure_for_bucket returns empty dict for non-existent bucket""" + result = cassie.get_retrace_failure_for_bucket("nonexistent_bucket_12345") + assert result == {} + + def test_get_metadata_for_buckets(self, cassandra_data): + """Test get_metadata_for_buckets returns metadata for multiple buckets""" + bucket_ids = [ + "/usr/bin/already-bucketed:11:func1:main", + "/usr/bin/failed-retrace:11:failed_func:main", + ] + metadata = cassie.get_metadata_for_buckets(bucket_ids) + assert isinstance(metadata, dict) + assert len(metadata) == 2 + assert metadata["/usr/bin/already-bucketed:11:func1:main"]["Source"] == "already-bucketed" + assert ( + metadata["/usr/bin/failed-retrace:11:failed_func:main"]["Source"] == "failed-retrace" + ) + + def test_get_metadata_for_buckets_empty(self, cassandra_data): + """Test get_metadata_for_buckets returns empty dict for empty list""" + metadata = cassie.get_metadata_for_buckets([]) + assert metadata == {} + + def test_get_user_crashes(self, cassandra_data): + """Test get_user_crashes returns list of crash UUIDs for a user""" + # Using the test system ID from create_test_data + user_token = "imatestsystem" + crashes = cassie.get_user_crashes(user_token, limit=5) + assert isinstance(crashes, list) + assert len(crashes) == 5 + for uuid_str, crash_time in crashes: + assert isinstance(uuid_str, str) + assert isinstance(crash_time, datetime) + first_crash = crashes[0] + more_crashes = cassie.get_user_crashes(user_token, limit=5, start=first_crash[0]) + assert len(more_crashes) == 5 + assert crashes[1] == more_crashes[0] + assert crashes[2] == more_crashes[1] + assert more_crashes[-1] not in crashes + + def test_get_user_crashes_nonexistent(self, cassandra_data): + """Test get_user_crashes returns empty list for non-existent user""" + crashes = cassie.get_user_crashes("nonexistent_user_12345") + assert crashes == [] + + def test_get_binary_packages_for_user(self, cassandra_data): + """Test get_binary_packages_for_user returns list of packages or None""" + packages = cassie.get_binary_packages_for_user("daisy-pluckers") + assert packages == ["already-bucketed", "failed-retrace"] + + def test_get_binary_packages_for_user_no_crash(self, cassandra_data): + """Test get_binary_packages_for_user returns an empty list when none of the user's packages have crashes""" + packages = cassie.get_binary_packages_for_user("foundations-bugs") + assert packages == [] + + def test_get_binary_packages_for_user_non_existing_user(self, cassandra_data): + """Test get_binary_packages_for_user returns None for a non-existent user""" + packages = cassie.get_binary_packages_for_user("nonexistent_user_12345") + assert packages is None + + def test_get_package_new_buckets(self, cassandra_data): + """Test get_package_new_buckets returns list of new crash buckets""" + buckets = cassie.get_package_new_buckets("python-traceback", "1.0", "1.1") + assert buckets == ["/usr/bin/pytraceback:RuntimeError:func2"] + buckets = cassie.get_package_new_buckets("python-traceback", "1.1", "1.2") + assert buckets == [] + + def
test_get_package_new_buckets_nonexistent(self, cassandra_data): + """Test get_package_new_buckets returns empty list for non-existent package""" + buckets = cassie.get_package_new_buckets("nonexistent_package", "1.0", "2.0") + assert buckets == [] + + def test_get_oopses_by_day(self, datetime_now, cassandra_data): + """Test get_oopses_by_day returns list of OOPS IDs for the given day""" + yesterday = (datetime_now - timedelta(days=1)).strftime("%Y%m%d") + oopses = list(cassie.get_oopses_by_day(yesterday)) + assert len(oopses) == 8 + assert all(isinstance(oops, UUID) for oops in oopses) + oopses = list(cassie.get_oopses_by_day(yesterday, limit=6)) + assert len(oopses) == 6 + a_week_ago = (datetime_now - timedelta(days=7)).strftime("%Y%m%d") + oopses = list(cassie.get_oopses_by_day(a_week_ago)) + assert len(oopses) == 1 + + def test_get_oopses_by_day_no_data(self, cassandra_data): + """Test get_oopses_by_day returns empty list for a day with no crashes""" + future_date = "20991231" # Far future date with no crashes + oopses = list(cassie.get_oopses_by_day(future_date)) + assert oopses == [] + + def test_get_oopses_by_release(self, cassandra_data): + """Test get_oopses_by_release returns list of OOPS IDs for the given release""" + oopses = list(cassie.get_oopses_by_release("Ubuntu 24.04")) + assert len(oopses) == 81 + assert all(isinstance(oops, UUID) for oops in oopses) + oopses = list(cassie.get_oopses_by_release("Ubuntu 24.04", limit=6)) + assert len(oopses) == 6 + + def test_get_oopses_by_release_no_data(self, cassandra_data): + """Test get_oopses_by_release returns empty list for a release with no crashes""" + oopses = list(cassie.get_oopses_by_release("Ubuntu 99.99")) + assert oopses == [] + + def test_get_total_buckets_by_day(self, cassandra_data): + """Test get_total_buckets_by_day returns date and count tuples""" + results = list(cassie.get_total_buckets_by_day(0, 7)) + assert len(results) == 7 + assert results[0][1] == 4 + assert results[1][1] == 2 + assert results[2][1] == 1 + assert results[-1][1] == 0 + for date, count in results: + assert isinstance(date, str) + assert len(date) == 8 # YYYYMMDD format + assert isinstance(count, int) + results = list(cassie.get_total_buckets_by_day(30, 31)) + assert len(results) == 1 + assert results[0][1] == 1 + + def test_get_bucket_counts(self, datetime_now, cassandra_data): + """Test get_bucket_counts returns list of (bucket_id, count) tuples""" + results = cassie.get_bucket_counts(release="Ubuntu 24.04", period="week") + assert results == [ + (b"/usr/bin/pytraceback:RuntimeError:func2", 3), + (b"/usr/bin/pytraceback:MemoryError:func3", 1), + (b"/usr/bin/already-bucketed:11:func1:main", 1), + (b"/usr/bin/failed-retrace:11:failed_func:main", 1), + (b"/usr/bin/pytraceback:Exception:func1", 1), + ] + + def test_get_bucket_counts_no_data(self, cassandra_data): + """Test get_bucket_counts returns empty list when no data matches""" + results = cassie.get_bucket_counts(release="Ubuntu 99.99", period="day") + assert results == [] + + def test_get_retracer_count(self, datetime_now, cassandra_data, retracer): + """Test get_retracer_count returns dictionary of retrace statistics""" + release = "Ubuntu 24.04" + yesterday = (datetime_now - timedelta(days=1)).strftime("%Y%m%d") + retracer.update_retrace_stats(release, yesterday, 30, True) + result = cassie.get_retracer_count(yesterday) + assert result == {"Ubuntu 24.04:amd64": {"success": 1}, "Ubuntu 24.04": {"success": 1}} + + def test_get_retracer_count_no_data(self, cassandra_data): + """Test 
get_retracer_count returns empty dict for date with no stats""" + result = cassie.get_retracer_count("20991231") + assert result == {} + + def test_get_retracer_counts(self, datetime_now, cassandra_data, retracer): + """Test get_retracer_counts returns generator of (date, stats) tuples""" + release = "Ubuntu 24.04" + yesterday = (datetime_now - timedelta(days=1)).strftime("%Y%m%d") + three_days_ago = (datetime_now - timedelta(days=3)).strftime("%Y%m%d") + retracer.update_retrace_stats(release, yesterday, 30, True) + retracer.update_retrace_stats(release, three_days_ago, 30, True) + retracer.update_retrace_stats(release, three_days_ago, 30, True) + results = list(cassie.get_retracer_counts(0, 7)) + assert isinstance(results[0][0], str) + assert len(results[0][0]) == 8 # YYYYMMDD format + assert results[1][1] == { + "Ubuntu 24.04:amd64": {"success": 2}, + "Ubuntu 24.04": {"success": 2}, + } + assert results[3][1] == { + "Ubuntu 24.04:amd64": {"success": 2}, + "Ubuntu 24.04": {"success": 2}, + } + + def test_get_retracer_means(self, datetime_now, cassandra_data, retracer): + """Test get_retracer_means returns list of (date, release_arch_dict) tuples""" + release = distro_info.UbuntuDistroInfo().lts(result="release") + release = "Ubuntu " + release.replace(" LTS", "") + yesterday = (datetime_now - timedelta(days=1)).strftime("%Y%m%d") + three_days_ago = (datetime_now - timedelta(days=3)).strftime("%Y%m%d") + retracer.update_retrace_stats(release, yesterday, 30, True) + retracer.update_retrace_stats(release, three_days_ago, 20, True) + retracer.update_retrace_stats(release, three_days_ago, 60, True) + results = cassie.get_retracer_means(1, 4) + assert isinstance(results[0][0], str) + assert len(results[0][0]) == 8 # YYYYMMDD format + assert results[0][1][release]["amd64"] == 30.0 + assert results[2][1][release]["amd64"] == 35.0 + + def test_get_crash_count(self, datetime_now, cassandra_data): + """Test get_crash_count returns generator of (date, count) tuples""" + results = list(cassie.get_crash_count(0, 7)) + assert isinstance(results[0][0], str) + assert len(results[0][0]) == 8 # YYYYMMDD format + assert results[0][1] == 20 + assert results[2][1] == 7 + + def test_get_crash_count_with_release(self, datetime_now, cassandra_data): + """Test get_crash_count with release parameter returns filtered results""" + results = list(cassie.get_crash_count(0, 7, release="Ubuntu 24.04")) + assert isinstance(results[0][0], str) + assert len(results[0][0]) == 8 # YYYYMMDD format + assert results[0][1] == 19 + assert results[2][1] == 7 + results = list(cassie.get_crash_count(0, 7, release="Ubuntu 26.04")) + assert results[0][1] == 1 + assert len(results) == 1 + + def test_get_average_crashes(self, datetime_now, cassandra_data): + """Test get_average_crashes returns list of (timestamp, average) tuples""" + yesterday = datetime_now.replace(hour=0, minute=0, second=0, microsecond=0) - timedelta( + days=1 + ) + result = cassie.get_average_crashes("python3-traceback", "Ubuntu 24.04", days=7) + assert result[0][0] == int(yesterday.timestamp()) + assert result[0][1] == approx(0.666666666) + + def test_get_average_crashes_no_data(self, cassandra_data): + """Test get_average_crashes returns empty list when no data exists""" + result = cassie.get_average_crashes("python3-traceback", "Ubuntu 99.99", days=7) + assert result == [] + + def test_get_average_instances(self, datetime_now, cassandra_data): + """Test get_average_instances returns generator of (timestamp, average) tuples""" + yesterday = 
datetime_now.replace(hour=0, minute=0, second=0, microsecond=0) - timedelta( + days=1 + ) + result = list( + cassie.get_average_instances( + "/usr/bin/pytraceback:RuntimeError:func2", "Ubuntu 24.04", days=7 + ) + ) + assert result[0][0] == int(yesterday.timestamp()) + assert result[0][1] == approx(0.333333333) + + def test_get_average_instances_no_data(self, cassandra_data): + """Test get_average_instances returns empty list for non-existent bucket""" + result = list(cassie.get_average_instances("nonexistent", "Ubuntu 24.04", days=7)) + assert result == [] diff --git a/src/tests/test_oopses.py b/src/tests/test_oopses.py index 7dc886b..dd57378 100644 --- a/src/tests/test_oopses.py +++ b/src/tests/test_oopses.py @@ -5,6 +5,7 @@ # the GNU Affero General Public License, version 3 ("AGPLv3"). See the file # LICENSE in the source tree for more information. +import datetime import json import time import uuid @@ -102,7 +103,7 @@ def _test_insert_check(self, oopsid, day_key, value=None): assert value == result["duration"] # The oops has been indexed by day oops_refs = cassandra_schema.DayOOPS.filter(key=day_key.encode()).only(["value"]) - assert [oopsid] == [day_oops.value.decode() for day_oops in oops_refs] + assert oopsid in [day_oops.value.decode() for day_oops in oops_refs] # TODO - the aggregates for the OOPS have been updated. def test_insert_oops_dict(self, temporary_db): @@ -124,12 +125,22 @@ def test_insert_updates_counters(self, temporary_db): day_key = oopses.insert_dict(oopsid, oops, user_token) oops_count = cassandra_schema.Counters.filter(key=b"oopses", column1=day_key) - assert [1] == [count.value for count in oops_count] + assert [3] == [count.value for count in oops_count] oopsid = str(uuid.uuid1()) day_key = oopses.insert_dict(oopsid, oops, user_token) oops_count = cassandra_schema.Counters.filter(key=b"oopses", column1=day_key) - assert [2] == [count.value for count in oops_count] + assert [4] == [count.value for count in oops_count] + + def test_insert_updates_errorsbyrelease(self, temporary_db): + oopsid = str(uuid.uuid1()) + oops = {"DistroRelease": "Ubuntu 42.42", "Date": "Tue Jan 20 14:01:54 2026"} + user_token = "user1" + + oopses.insert_dict(oopsid, oops, user_token) + result = list(cassandra_schema.ErrorsByRelease.filter(key="Ubuntu 42.42")) + assert len(result) == 1 + assert result[0].value == datetime.datetime(2026, 1, 20, 14, 1, 54) class TestBucket: diff --git a/src/tools/remove_old_release_data.py b/src/tools/remove_old_release_data.py index 44e07dd..13f8ffe 100755 --- a/src/tools/remove_old_release_data.py +++ b/src/tools/remove_old_release_data.py @@ -1,20 +1,15 @@ #!/usr/bin/python3 -import os import sys -from datetime import datetime, timedelta -from time import sleep import distro_info from cassandra import OperationTimedOut from cassandra.cluster import NoHostAvailable +from tenacity import retry, retry_if_exception_type, wait_exponential -from errortracker import cassandra +from errortracker import cassandra, cassandra_schema -session = cassandra.cassandra_session() - -oops_lookup_stmt = session.prepare('SELECT * FROM "OOPS" WHERE key=?') -oops_delete_stmt = session.prepare('DELETE FROM "OOPS" WHERE key=? 
AND column1=?') +cassandra.setup_cassandra() URL = "https://errors.ubuntu.com/oops/" @@ -110,118 +105,32 @@ ) +@retry( + wait=wait_exponential(), retry=retry_if_exception_type((OperationTimedOut, NoHostAvailable)) +) def check_and_remove_oops(oopsid): - data = {} - max_retries = 5 - for i in range(max_retries): - period = 30 + (30 * i) - try: - oops_data = session.execute(oops_lookup_stmt, [oopsid.encode()]) - except (OperationTimedOut, NoHostAvailable): - print(("Sleeping %ss as we timed out when querying." % period)) - sleep(period) - continue - else: - break - else: - print(("Cassandra operation timed out %s times." % max_retries)) - return - # all the column "names" are column1 so make a dictionary of keys: values - for od in oops_data: - data[od.column1] = od.value - # just double check that its the right release - if data.get("DistroRelease", "") == rname: - if data.get("ProcMaps", "") == "": - # print("Skipping already cleaned crash.") + oops_data = cassandra_schema.OOPS.get_as_dict(key=oopsid.encode()) + if oops_data.get("DistroRelease", "") == release_name: + if oops_data.get("Date", "") == "": + print(("%s%s was skipped (already cleaned)" % (URL, oopsid))) return for column in unneeded_columns: - for i in range(max_retries): - period = 30 + (30 * i) - try: - session.execute(oops_delete_stmt, [oopsid.encode(), "%s" % column]) - except (OperationTimedOut, NoHostAvailable): - print(("Sleeping %ss as we timed out when deleting." % period)) - sleep(period) - continue - else: - break - else: - print(("Cassandra operation timed out %s times." % max_retries)) - return - print(("%s%s was from %s and had its data removed" % (URL, oopsid, rname))) + cassandra_schema.OOPS.filter(key=oopsid.encode(), column1=column).delete() + print(("%s%s was from %s and had its data removed" % (URL, oopsid, release_name))) + else: + print( + ("%s%s was from %s and was kept" % (URL, oopsid, oops_data.get("DistroRelease", ""))) + ) -# Main if __name__ == "__main__": - if "--dry-run" in sys.argv: - dry_run = True - sys.argv.remove("--dry-run") - else: - dry_run = False - codename = sys.argv[1] di = distro_info.UbuntuDistroInfo() release = [r for r in di.get_all("object") if r.series == codename][0] # strip out "LTS" - rname = "Ubuntu %s" % release.version.split()[0] - - open_date = release.created - eol_date = release.eol - - # use restart_date if you have to stop and start the job again - restart_date = "" - if restart_date: - open_date = datetime.strptime(restart_date, "%Y-%m-%d").date() - - delta = eol_date - open_date - - for i in range(delta.days + 1): - current_date = open_date + timedelta(days=i) - - removal_progress = "%s-remove_old_%s_data.txt" % ( - current_date, - rname.split(" ")[-1], - ) - if os.path.exists(removal_progress): - with open(removal_progress, "r") as f: - last_row = f.readline() - else: - last_row = "" - - run = 1 - if last_row == "": - r_oopses = session.execute( - 'SELECT * FROM "ErrorsByRelease" ' - "WHERE key = '%s' " - "AND key2 = '%s' LIMIT 5000" % (rname, current_date) - ) - print(("%s %s run: %s" % (rname, current_date, run))) - for r_oops_row in r_oopses: - check_and_remove_oops(str(r_oops_row.column1)) - last_row = str(r_oops_row.column1) - run += 1 - - if last_row == "": - continue + release_name = "Ubuntu %s" % release.version.split()[0] - while run < 150: - r_oopses2 = session.execute( - 'SELECT * FROM "ErrorsByRelease" ' - "WHERE key = '%s' " - "AND key2 = '%s' AND column1 > %s " - "LIMIT 5000" % (release, current_date, last_row) - ) - print(("%s %s run: %s" % (rname, 
current_date, run))) - r_oops_row = "" - for r_oops_row in r_oopses2: - check_and_remove_oops(str(r_oops_row.column1)) - last_row = str(r_oops_row.column1) - if r_oops_row: - with open(removal_progress, "w") as f: - f.write(str(r_oops_row.column1)) - else: - if os.path.exists(removal_progress): - os.unlink(removal_progress) - break - run += 1 + for row in cassandra_schema.ErrorsByRelease.filter(key=release_name).allow_filtering().all(): + check_and_remove_oops(str(row.column1)) + row.delete() diff --git a/src/tools/unique_users_daily_update.py b/src/tools/unique_users_daily_update.py index 6b46d5c..a1ae86c 100755 --- a/src/tools/unique_users_daily_update.py +++ b/src/tools/unique_users_daily_update.py @@ -11,7 +11,7 @@ cassandra.setup_cassandra() session = cassandra.cassandra_session() -d = distro_info.UbuntuDistroInfo() +UDI = distro_info.UbuntuDistroInfo() # Utilities @@ -24,7 +24,7 @@ def _date_range_iterator(start, finish): # Main -if __name__ == "__main__": +def main(): if "--dry-run" in sys.argv: dry_run = True sys.argv.remove("--dry-run") @@ -33,10 +33,10 @@ def _date_range_iterator(start, finish): releases = [ "Ubuntu " + r.replace(" LTS", "") - for r in sorted(set(d.supported(result="release") + d.supported_esm(result="release"))) + for r in sorted(set(UDI.supported(result="release") + UDI.supported_esm(result="release"))) ] try: - releases.append("Ubuntu " + d.devel(result="release")) + releases.append("Ubuntu " + UDI.devel(result="release")) except distro_info.DistroDataOutdated: print("Distro info outdated, unable to process devel") @@ -84,3 +84,7 @@ def _date_range_iterator(start, finish): ) print(("%s:%s" % (release, len(users)))) print(("from %s days" % day_count)) + + +if __name__ == "__main__": + main()