find_unique_checksums.py
(forked from gwillem/magento-version-identification)
"""
Searching for the smallest set of (publicly accessible) filenames that
identify a Magento version.
"""
from glob import glob
from collections import defaultdict
import json
import re
import unittest


def _sort_filehash_on_granularity_and_impact(filehash):
    # granularity = how many versions does this filehash identify (ideally, 1)
    # impact = how many versions have a unique hash for this file (reverse sort)
    filename, hash = filehash
    granularity = len(md5sums[filename][hash])
    impact = -len(unique_sums[filename])
    return (granularity, impact)
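
# Note: the key above reads the module-level md5sums/unique_sums dicts that
# are built in the __main__ block below. As an illustration (values are
# hypothetical): a key of (1, -38) -- a hash unique to one version, from a
# file with 38 uniquely-hashed versions -- sorts before (2, -40), because
# tuples compare element by element, so lower granularity wins first.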


def humanize(versions):
    # squash multiple versions into a single pattern; assumes input is sorted
    allversions = [re.split(r'[. ]', x) for x in versions]
    minlength = min([len(x) for x in allversions])
    output = []
    for i in range(minlength):
        items_at_pos_i = set([x[i] for x in allversions])
        if len(items_at_pos_i) != 1:
            output.append('x')
            break
        output.append(items_at_pos_i.pop())
    if len(output) <= 1:
        return ', '.join(versions)
    return output.pop(0) + ' ' + '.'.join(output)
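
# For example (taken from the unit tests at the bottom of this file):
#   humanize(['CE 1.3.2.3', 'CE 1.3.2.4', 'CE 1.3.3.0']) -> 'CE 1.3.x'
#   humanize(['CE 1.7.0.1', 'EE 1.7.0.2']) -> 'CE 1.7.0.1, EE 1.7.0.2'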


if __name__ == '__main__':
    sources = glob('md5sums/*')

    md5sums = defaultdict(lambda: defaultdict(list))
    """
    md5sums = {
        'file1': {
            'hash1': ['magento1', 'magento2'],
            'hash2': ...
        }
    }
    """

    releases = defaultdict(dict)

    unique_sums = defaultdict(dict)
    """
    unique_sums = {
        'file1': {
            'hash1': 'releasex',
            'hash2': 'releasey'
        }
    }
    """

    releases_hashes = defaultdict(list)

    for source in sources:
        release = source.split('/')[1].replace('magento-', '').replace('-', ' ')
        with open(source) as fh:
            for line in fh:
                if 'skin/frontend' in line:
                    continue
                md5, name = line.strip().split()
                releases[release][name] = md5
                md5sums[name][md5].append(release)

    for filename, hashes in md5sums.items():
        for hash, versions in hashes.items():
            if len(versions) > 1:
                continue
            release = versions[0]
            unique_sums[filename][hash] = release
            releases_hashes[release].append(hash)
    fingerprints = defaultdict(dict)
    for version, files in sorted(releases.items()):
        # find the file/hash from files which has the fewest
        # versions attached (preferably just 1).
        ordered_files = sorted(files.items(), key=_sort_filehash_on_granularity_and_impact)
        filename, hash = ordered_files[0]
        all_versions_for_this_hash = md5sums[filename][hash]
        fingerprints[filename][hash] = humanize(sorted(all_versions_for_this_hash))
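    # At this point fingerprints maps each chosen file to hash -> version
    # label, e.g. (file name and hashes hypothetical):
    # fingerprints = {
    #     'js/mage/cookies.js': {
    #         'hash1': 'CE 1.3.x',
    #         'hash2': 'CE 1.7.0.x'
    #     }
    # }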

    # After we determine which files to include, add all unique hashes to
    # these file entries. Some versions are now listed multiple times. This
    # reduces the number of files we need to request and gives better results
    # if a site has modified some files.
    for filename in fingerprints.keys():
        for hash, release in unique_sums[filename].items():
            fingerprints[filename][hash] = release
    print(json.dumps(fingerprints, indent=4))
    with open('version_hashes.json', 'w') as f:
        f.write(json.dumps(fingerprints, indent=4))


class TestIt(unittest.TestCase):
    def test_humanize(self):
        tests = (
            (['CE 1.3.2.3', 'CE 1.3.2.4', 'CE 1.3.3.0'], 'CE 1.3.x'),
            (['CE 1.7.0.1', 'CE 1.7.0.2'], 'CE 1.7.0.x'),
            (['CE 1.7.0.1', 'EE 1.7.0.2'], 'CE 1.7.0.1, EE 1.7.0.2'),
        )
        for test, expected in tests:
            real = humanize(test)
            self.assertEqual(real, expected)
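
# Run the tests with the standard unittest runner (the __main__ block above is
# skipped on import, so no md5sums/ directory is needed):
#   python -m unittest find_unique_checksums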