#!/usr/bin/env python
"""Mine Debian release-critical bug data and list interesting items.

Generates the "Debian Neglected RC Bugs" report: it downloads the Debian
release-critical bug list, fetches the mbox of every bug listed there, and
writes an HTML page (OUTPUT_NAME) of the bugs that have received no
follow-ups and whose last message is at least ``min_days`` days old.

The code runs on both Python 2.7 and Python 3.

John Belmonte
"""

import email.utils
import io
import os
import re
import sys
import time

try:  # Python 3
    from html import escape
    from urllib.request import urlopen
except ImportError:  # Python 2
    from cgi import escape
    from urllib import urlopen

RC_LIST_URL = 'http://bugs.debian.org/release-critical/debian/main.html'
BUG_URL = 'http://bugs.debian.org/%d'
BUG_MBOX_URL = 'http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=%s&mbox=yes'
OUTPUT_NAME = 'neglected-bugs.html'

# Minimum age, in days, of the last message for a bug to count as neglected.
min_days = 14

# NOTE(review): the HTML markup of this template was lost from the source
# this file was recovered from; only the text survived.  The tags below are
# a minimal reconstruction -- adjust to taste.
# The ``u`` prefix keeps the rendered page a text string on Python 2 as well,
# which io.open() requires when writing.
HTML_TEMPLATE = u"""\
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Debian Neglected RC Bugs</title>
</head>
<body>
<h1>Debian Neglected RC Bugs</h1>

<p>This is a list of Debian bugs having all of the following attributes:</p>

<ul>
<li>in the main archive</li>
<li>release-critical</li>
<li>no follow-ups</li>
<li>older than %(min_days)d days</li>
</ul>

<p>Generated on %(current_date_string)s by mine-bugs.</p>

<p>%(item_count)d items, sorted by descending bug number.</p>

<pre>
Bug No.  Days  Package           Bug Title
-------  ----  ----------------  ---------
%(output_list)s
</pre>
</body>
</html>
"""

# NOTE(review): the original pattern was stripped down to an empty string
# together with the rest of the markup.  This assumes every bug on the RC
# list page is introduced by an anchor named after its number -- confirm
# against the live page before relying on it.
bug_re = re.compile(r'<a name="(\d+)">', re.IGNORECASE)
package_re = re.compile(r'^Package:\s*(.*?)$', re.MULTILINE | re.IGNORECASE)
subject_re = re.compile(r'^Subject: (.*?)$', re.MULTILINE)
# One match (the Date header value) per message in the mbox.
from_re = re.compile(r'^From .*?^Date: (.*?)$', re.MULTILINE | re.DOTALL)


class BugRecord(object):
    """The facts about one bug that the report needs."""

    def __init__(self, bug_number, elapsed_days, package, subject,
                 num_followups):
        self.bug_number = bug_number        # int
        self.elapsed_days = elapsed_days    # days since the last message
        self.package = package
        self.subject = subject
        self.num_followups = num_followups  # messages after the first one


def is_neglected(record):
    """Return True if record has no follow-ups and is min_days old or more."""
    return record.num_followups <= 0 and record.elapsed_days >= min_days


def sort_by_bug_num(a, b):
    """cmp-style comparator ordering records by descending bug number."""
    # (x > y) - (x < y) is the portable spelling of Python 2's cmp(x, y).
    return (b.bug_number > a.bug_number) - (b.bug_number < a.bug_number)


def shrink(s, num_characters):
    """Return s truncated to num_characters, adding "..." if needed."""
    suffix = '...'
    if len(s) <= num_characters:
        return s
    if num_characters < len(suffix):
        # No room for the suffix: hard-truncate rather than overshoot.
        return s[:max(num_characters, 0)]
    return s[:num_characters - len(suffix)] + suffix


def record_to_html(record):
    """Return the report row for record: linked number, age, package, title.

    The columns line up with the header row in HTML_TEMPLATE.
    """
    # remove package name prefix from bug title, since it's redundant
    subject = re.sub(r'^%s[:_]?\s*' % re.escape(record.package), '',
                     record.subject)
    # Pad before escaping so that entities do not disturb the column width.
    package = escape('%-16s' % shrink(record.package, 16), quote=False)
    return '<a href="%s">%7d</a>  %4d  %s  %s' % (
        BUG_URL % record.bug_number,
        record.bug_number,
        int(record.elapsed_days),
        package,
        escape(subject, quote=False))


def _fetch(url):
    """Return the document at url decoded as text."""
    response = urlopen(url)
    try:
        return response.read().decode('utf-8', 'replace')
    finally:
        response.close()


def _parse_date(date_string):
    """Return the POSIX timestamp of a mail Date header, or None.

    The UTC offset is honoured; a header without one is read as local time.
    """
    parsed = email.utils.parsedate_tz(date_string)
    if parsed is None:
        return None
    try:
        return email.utils.mktime_tz(parsed)
    except (ValueError, OverflowError):
        return None


def _build_record(bug_number, bug_mbox, now):
    """Parse one bug's mbox text into a BugRecord, or None if unusable."""
    package_match = package_re.search(bug_mbox)
    if package_match is None:
        return None
    subject_match = subject_re.search(bug_mbox)
    subject = subject_match.group(1) if subject_match else '(no subject)'
    dates = from_re.findall(bug_mbox)
    if not dates:
        return None
    last_time = _parse_date(dates[-1])
    if last_time is None:
        return None
    elapsed_days = max(now - last_time, 0) / (60.0 * 60 * 24)
    return BugRecord(int(bug_number), elapsed_days,
                     package_match.group(1).strip(), subject,
                     len(dates) - 1)


def main():
    """Fetch the bug data and write the report to OUTPUT_NAME."""
    bug_numbers = bug_re.findall(_fetch(RC_LIST_URL))
    now = time.time()
    current_date_string = time.strftime('%a %d %b %H:%M:%S UTC %Y',
                                        time.gmtime())

    bug_records = []
    for bug_number in bug_numbers:
        try:
            bug_mbox = _fetch(BUG_MBOX_URL % bug_number)
        except (IOError, OSError) as error:
            # One unreachable bug should not abort the whole report.
            sys.stderr.write('skipping bug %s: %s\n' % (bug_number, error))
            continue
        record = _build_record(bug_number, bug_mbox, now)
        if record is not None:
            bug_records.append(record)

    output_records = sorted(
        (record for record in bug_records if is_neglected(record)),
        key=lambda record: record.bug_number, reverse=True)
    output_html = HTML_TEMPLATE % {
        'min_days': min_days,
        'current_date_string': current_date_string,
        'item_count': len(output_records),
        'output_list': '\n'.join(
            record_to_html(record) for record in output_records),
    }

    # Keep the previous report around as a backup.
    if os.path.exists(OUTPUT_NAME):
        os.rename(OUTPUT_NAME, OUTPUT_NAME + '.bak')
    with io.open(OUTPUT_NAME, 'w', encoding='utf-8') as output_file:
        output_file.write(output_html)


if __name__ == '__main__':
    main()