Merge lp://staging/~jameinel/bzr-stats/better-matching into lp://staging/bzr-stats

Proposed by John A Meinel
Status: Merged
Merged at revision: not available
Proposed branch: lp://staging/~jameinel/bzr-stats/better-matching
Merge into: lp://staging/bzr-stats
Diff against target: 289 lines (+114/-89)
1 file modified
__init__.py (+114/-89)
To merge this branch: bzr merge lp://staging/~jameinel/bzr-stats/better-matching
Reviewer Review Type Date Requested Status
bzr-stats developers Pending
Review via email: mp+17483@code.staging.launchpad.net
To post a comment you must log in.
Revision history for this message
John A Meinel (jameinel) wrote :

This is a fairly major overhaul of the committer-matching logic. Recent revisions had become even more confused, and had started matching all of the authors in a single commit together.

This changes the logic so that:

1) Start by breaking committers into (user, email) pairs.
2) For all revisions, grab the apparent authors
3) Map email => usernames
4) Map email => generic_id, username => generic_id, and generic_id => (user, email) pairs.
When you find a collision (a username or email that has been seen before), collapse the old values into the new entry.
5) Never collapse on an empty email or username. (So you can match (user, '') to (user, *), but you will never try to collapse all of the (*, '') or ('', *) entries together.)

This means we actually get decent stats on bzr.dev again.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file '__init__.py'
2--- __init__.py 2009-07-17 18:07:09 +0000
3+++ __init__.py 2010-01-15 21:48:14 +0000
4@@ -1,4 +1,4 @@
5-# Copyright (C) 2005-2008 Canonical Ltd
6+# Copyright (C) 2006-2010 Canonical Ltd
7
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10@@ -23,6 +23,7 @@
11 config,
12 errors,
13 option,
14+ trace,
15 tsort,
16 ui,
17 workingtree,
18@@ -32,19 +33,7 @@
19 """)
20
21
22-def find_fullnames(lst):
23- """Find the fullnames for a list committer names."""
24-
25- counts = {}
26- for committer in lst:
27- fullname = config.parse_username(committer)[0]
28- counts.setdefault(fullname, 0)
29- counts[fullname] += 1
30- return sorted(((count, name) for name,count in counts.iteritems()),
31- reverse=True)
32-
33-
34-def collapse_by_person(committers):
35+def collapse_by_person(revisions, canonical_committer):
36 """The committers list is sorted by email, fix it up by person.
37
38 Some people commit with a similar username, but different email
39@@ -55,65 +44,116 @@
40 So take the most common username for each email address, and
41 combine them into one new list.
42 """
43- # Just an indirection so that multiple names can reference
44- # the same record information
45- name_to_counter = {}
46- # indirection back to real information
47- # [[full_rev_list], {email:count}, {fname:count}]
48- counter_to_info = {}
49- counter = 0
50- for email, revs in committers.iteritems():
51- authors = []
52- for rev in revs:
53- authors += rev.get_apparent_authors()
54- fullnames = find_fullnames(authors)
55- match = None
56- for count, fullname in fullnames:
57- if fullname and fullname in name_to_counter:
58- # We found a match
59- match = name_to_counter[fullname]
60- break
61-
62- if match:
63- # One of the names matched, we need to collapse to records
64- record = counter_to_info[match]
65- record[0].extend(revs)
66- record[1][email] = len(revs)
67- for count, fullname in fullnames:
68- name_to_counter[fullname] = match
69- record[2].setdefault(fullname, 0)
70- record[2][fullname] += count
71- else:
72- # just add this one to the list
73- counter += 1
74- for count, fullname in fullnames:
75- if fullname:
76- name_to_counter[fullname] = counter
77- fname_map = dict((fullname, count) for count, fullname in fullnames)
78- counter_to_info[counter] = [revs, {email:len(revs)}, fname_map]
79- return sorted(((len(revs), revs, email, fname)
80- for revs, email, fname in counter_to_info.values()), reverse=True)
81-
82-
83-def sort_by_committer(a_repo, revids):
84- committers = {}
85+ # Map from canonical committer to
86+ # {committer: ([rev_list], {email: count}, {fname:count})}
87+ committer_to_info = {}
88+ for rev in revisions:
89+ authors = rev.get_apparent_authors()
90+ for author in authors:
91+ username, email = config.parse_username(author)
92+ canon_author = canonical_committer[(username, email)]
93+ info = committer_to_info.setdefault(canon_author, ([], {}, {}))
94+ info[0].append(rev)
95+ info[1][email] = info[1].setdefault(email, 0) + 1
96+ info[2][username] = info[2].setdefault(username, 0) + 1
97+ res = [(len(revs), revs, emails, fnames)
98+ for revs, emails, fnames in committer_to_info.itervalues()]
99+ res.sort(reverse=True)
100+ return res
101+
102+
103+def collapse_email_and_users(email_users, combo_count):
104+ """Combine the mapping of User Name to email and email to User Name.
105+
106+ If a given User Name is used for multiple emails, try to map it all to one
107+ entry.
108+ """
109+ id_to_combos = {}
110+ username_to_id = {}
111+ email_to_id = {}
112+ id_counter = 0
113+
114+ def collapse_ids(old_id, new_id, new_combos):
115+ old_combos = id_to_combos.pop(old_id)
116+ new_combos.update(old_combos)
117+ for old_user, old_email in old_combos:
118+ if (old_user and old_user != user):
119+ old_user_id = username_to_id[old_user]
120+ assert old_user_id in (old_id, new_id)
121+ username_to_id[old_user] = new_id
122+ if (old_email and old_email != email):
123+ old_email_id = email_to_id[old_email]
124+ assert old_email_id in (old_id, new_id)
125+ email_to_id[old_email] = cur_id
126+ for email, usernames in email_users.iteritems():
127+ assert email not in email_to_id
128+ if not email:
129+ # We use a different algorithm for usernames that have no email
130+ # address, we just try to match by username, and not at all by
131+ # email
132+ for user in usernames:
133+ if not user:
134+ continue # The mysterious ('', '') user
135+ user_id = username_to_id.get(user)
136+ if user_id is None:
137+ id_counter += 1
138+ user_id = id_counter
139+ username_to_id[user] = user_id
140+ id_to_combos[user_id] = id_combos = set()
141+ else:
142+ id_combos = id_combos[user_id]
143+ id_combos.add((user, email))
144+ continue
145+
146+ id_counter += 1
147+ cur_id = id_counter
148+ id_to_combos[cur_id] = id_combos = set()
149+ email_to_id[email] = cur_id
150+
151+ for user in usernames:
152+ combo = (user, email)
153+ id_combos.add(combo)
154+ if not user:
155+ # We don't match on empty usernames
156+ continue
157+ user_id = username_to_id.get(user)
158+ if user_id is not None:
159+ # This UserName was matched to an cur_id
160+ if user_id != cur_id:
161+ # And it is a different identity than the current email
162+ collapse_ids(user_id, cur_id, id_combos)
163+ username_to_id[user] = cur_id
164+ combo_to_best_combo = {}
165+ for cur_id, combos in id_to_combos.iteritems():
166+ best_combo = sorted(combos,
167+ key=lambda x:combo_count[x],
168+ reverse=True)[0]
169+ for combo in combos:
170+ combo_to_best_combo[combo] = best_combo
171+ return combo_to_best_combo
172+
173+
174+def get_revisions_and_committers(a_repo, revids):
175+ """Get the Revision information, and the best-match for committer."""
176+
177+ email_users = {} # user@email.com => User Name
178+ combo_count = {}
179 pb = ui.ui_factory.nested_progress_bar()
180 try:
181- pb.note('getting revisions')
182+ trace.note('getting revisions')
183 revisions = a_repo.get_revisions(revids)
184 for count, rev in enumerate(revisions):
185 pb.update('checking', count, len(revids))
186 for author in rev.get_apparent_authors():
187- username = config.parse_username(author)
188- if username[1] == '':
189- email = username[0]
190- else:
191- email = username[1]
192- committers.setdefault(email, []).append(rev)
193+ # XXX: There is a chance sometimes with svn imports that the
194+ # full name and email can BOTH be blank.
195+ username, email = config.parse_username(author)
196+ email_users.setdefault(email, set()).add(username)
197+ combo = (username, email)
198+ combo_count[combo] = combo_count.setdefault(combo, 0) + 1
199 finally:
200 pb.finished()
201-
202- return committers
203+ return revisions, collapse_email_and_users(email_users, combo_count)
204
205
206 def get_info(a_repo, revision):
207@@ -121,15 +161,14 @@
208 pb = ui.ui_factory.nested_progress_bar()
209 a_repo.lock_read()
210 try:
211- pb.note('getting ancestry')
212+ trace.note('getting ancestry')
213 ancestry = a_repo.get_ancestry(revision)[1:]
214-
215- committers = sort_by_committer(a_repo, ancestry)
216+ revs, canonical_committer = get_revisions_and_committers(a_repo, ancestry)
217 finally:
218 a_repo.unlock()
219 pb.finished()
220
221- return collapse_by_person(committers)
222+ return collapse_by_person(revs, canonical_committer)
223
224
225 def get_diff_info(a_repo, start_rev, end_rev):
226@@ -138,7 +177,6 @@
227 This lets us figure out what has actually changed between 2 revisions.
228 """
229 pb = ui.ui_factory.nested_progress_bar()
230- committers = {}
231 a_repo.lock_read()
232 try:
233 pb.note('getting ancestry 1')
234@@ -146,23 +184,12 @@
235 pb.note('getting ancestry 2')
236 ancestry = a_repo.get_ancestry(end_rev)[1:]
237 ancestry = [rev for rev in ancestry if rev not in start_ancestry]
238- pb.note('getting revisions')
239- revisions = a_repo.get_revisions(ancestry)
240-
241- for count, rev in enumerate(revisions):
242- pb.update('checking', count, len(ancestry))
243- for author in rev.get_apparent_authors():
244- try:
245- email = config.extract_email_address(author)
246- except errors.BzrError:
247- email = author
248- committers.setdefault(email, []).append(rev)
249+ revs, canonical_committer = sort_by_committer(a_repo, ancestry)
250 finally:
251 a_repo.unlock()
252 pb.finished()
253
254- info = collapse_by_person(committers)
255- return info
256+ return collapse_by_person(revs, canonical_committer)
257
258
259 def display_info(info, to_file, gather_class_stats=None):
260@@ -176,9 +203,7 @@
261 sorted_fullnames = sorted(((count, fullname)
262 for fullname,count in fullnames.iteritems()),
263 reverse=True)
264- # There is a chance sometimes with svn imports that the full name and
265- # email can BOTH be blank.
266- if sorted_fullnames[0][1] == '':
267+ if sorted_fullnames[0][1] == '' and sorted_emails[0][1] == '':
268 to_file.write('%4d %s\n'
269 % (count, 'Unknown'))
270 else:
271@@ -186,15 +211,15 @@
272 % (count, sorted_fullnames[0][1],
273 sorted_emails[0][1]))
274 if len(sorted_fullnames) > 1:
275- print ' Other names:'
276- for count, fname in sorted_fullnames[1:]:
277+ to_file.write(' Other names:\n')
278+ for count, fname in sorted_fullnames:
279 to_file.write(' %4d ' % (count,))
280 if fname == '':
281 to_file.write("''\n")
282 else:
283 to_file.write("%s\n" % (fname,))
284 if len(sorted_emails) > 1:
285- print ' Other email addresses:'
286+ to_file.write(' Other email addresses:\n')
287 for count, email in sorted_emails:
288 to_file.write(' %4d ' % (count,))
289 if email == '':

Subscribers

People subscribed via source and target branches

to all changes: