From f33eea686a5f4d5517eaeb173986f4d445831db8 Mon Sep 17 00:00:00 2001 From: Ryan Gaffney Date: Tue, 5 Apr 2016 14:52:24 -0700 Subject: [PATCH] Replace faker with fake-factory --- anonymizer/base.py | 40 ++++++------- anonymizer/introspect.py | 15 +---- anonymizer/replacers.py | 29 ++++------ anonymizer/tests/models.py | 26 +++++++-- anonymizer/tests/tests.py | 115 ++++++++++++++++++++++++++----------- setup.py | 2 +- 6 files changed, 131 insertions(+), 96 deletions(-) diff --git a/anonymizer/base.py b/anonymizer/base.py index 7e9d123..22bcb2f 100644 --- a/anonymizer/base.py +++ b/anonymizer/base.py @@ -1,18 +1,16 @@ import decimal import random -import six - -from six.moves import xrange - from collections import defaultdict from datetime import datetime from multiprocessing import Pool from uuid import uuid4 +import six from anonymizer import replacers from django.db import connection, transaction -from faker import Faker, data -from faker.utils import bothify, uk_postcode +from faker import Faker + +from six.moves import xrange randrange = random.SystemRandom().randrange @@ -73,13 +71,9 @@ def get_allowed_value(self, source, field): return retval - # Public interface def uuid(self, field=None): - retval = str(uuid4()) - max_length = getattr(field, 'max_length', None) - if max_length is not None: - retval = retval[:max_length] - return retval + # bypass chopping from max_length + return str(uuid4()) def varchar(self, field=None): """ @@ -98,7 +92,7 @@ def simple_pattern(self, pattern, field=None): Use a simple pattern to make the field - # is replaced with a random number, ? with a random letter. """ - return self.get_allowed_value(lambda: bothify(pattern), field) + return self.get_allowed_value(lambda: self.faker.bothify(pattern), field) def bool(self, field=None): """ @@ -144,14 +138,11 @@ def source(): return decimal.Decimal(random.randrange(0, 100000))/(10**field.decimal_places) return self.get_allowed_value(source, field) - def uk_postcode(self, field=None): - return self.get_allowed_value(uk_postcode, field) - - def uk_county(self, field=None): - return self.get_allowed_value(lambda: random.choice(data.UK_COUNTIES), field) + def postcode(self, field=None): + return self.get_allowed_value(self.faker.postcode, field) - def uk_country(self, field=None): - return self.get_allowed_value(lambda: random.choice(data.UK_COUNTRIES), field) + def country(self, field=None): + return self.get_allowed_value(self.faker.country, field) def lorem(self, field=None, val=None): """ @@ -164,7 +155,7 @@ def generate(length): # Get lorem ipsum of a specific length. collect = "" while len(collect) < length: - collect += self.faker.lorem() + collect += ' %s' % self.faker.sentence() collect = collect[:length] return collect @@ -177,7 +168,8 @@ def source(): parts[i] = generate(len(p)) return "\n".join(parts) else: - source = self.faker.lorem + def source(): + return ' '.join(self.faker.sentences()) return self.get_allowed_value(source, field) def unique_lorem(self, field=None, val=None): @@ -202,12 +194,12 @@ def choice(self, field=None): return self.get_allowed_value(lambda: random.choice(choices), field) # Other attributes provided by 'Faker': - # username + # user_name # first_name # last_name # name # email - # full_address + # address # phonenumber # street_address # city diff --git a/anonymizer/introspect.py b/anonymizer/introspect.py index 2c100e2..0a827a5 100644 --- a/anonymizer/introspect.py +++ b/anonymizer/introspect.py @@ -30,9 +30,6 @@ (r'(\b|_)email\d*', '"email"'), (r'(\b|_)town\d*', '"city"'), (r'(\b|_)city\d*', '"city"'), - (r'(\b|_)county\d*', '"uk_county"'), - (r'(\b|_)post_code\d*', '"uk_postcode"'), - (r'(\b|_)postcode\d*', '"uk_postcode"'), (r'(\b|_)zip\d*', '"zip_code"'), (r'(\b|_)zipcode\d*', '"zip_code"'), (r'(\b|_)zip_code\d*', '"zip_code"'), @@ -97,17 +94,7 @@ class %(modelname)sAnonymizer(Anonymizer): def create_anonymizer(model): attributes = [] - fields = list(model._meta.fields) - # For the faker.name/username/email magic to work as expected and produce - # consistent sets of names/email addreses, they must be accessed in the - # same order. This will usually not be a problem, but if duplicate names - # are produced and the field is unique=True, the logic in DjangoFaker for - # getting new values from the 'source' means that the order will become out - # of sync. To avoid this, we put fields with 'unique=True' at the beginning - # of the list. Usually this will only be the username. - fields.sort(key=lambda f: not getattr(f, 'unique', False)) - - for f in fields: + for f in model._meta.fields: replacer = get_replacer_for_field(f) attributes.append(attribute_template % {'attname': f.attname, 'replacer': replacer}) diff --git a/anonymizer/replacers.py b/anonymizer/replacers.py index 5b00f16..ac10b74 100644 --- a/anonymizer/replacers.py +++ b/anonymizer/replacers.py @@ -71,32 +71,25 @@ def decimal(anon, obj, field, val): return anon.faker.decimal(field=field) -def uk_postcode(anon, obj, field, val): +def postcode(anon, obj, field, val): """ - Generates a random UK postcode (not necessarily valid, but it will look like one). + Generates a random postcode (not necessarily valid, but it will look like one). """ - return anon.faker.uk_postcode(field=field) + return anon.faker.postcode(field=field) -def uk_country(anon, obj, field, val): +def country(anon, obj, field, val): """ - Returns a randomly selected country that is part of the UK + Returns a randomly selected country. """ - return anon.faker.uk_country(field=field) - - -def uk_county(anon, obj, field, val): - """ - Returns a randomly selected county from the UK - """ - return anon.faker.uk_county(field=field) + return anon.faker.country(field=field) def username(anon, obj, field, val): """ Generates a random username """ - return anon.faker.username(field=field) + return anon.faker.user_name(field=field) def first_name(anon, obj, field, val): @@ -139,14 +132,14 @@ def full_address(anon, obj, field, val): Generates a random full address, using newline characters between the lines. Resembles a US address """ - return anon.faker.full_address(field=field) + return anon.faker.address(field=field) def phonenumber(anon, obj, field, val): """ Generates a random US-style phone number """ - return anon.faker.phonenumber(field=field) + return anon.faker.phone_number(field=field) def street_address(anon, obj, field, val): @@ -174,7 +167,7 @@ def zip_code(anon, obj, field, val): """ Returns a randomly generated US zip code (not necessarily valid, but will look like one). """ - return anon.faker.zip_code(field=field) + return anon.faker.zipcode(field=field) def company(anon, obj, field, val): @@ -188,7 +181,7 @@ def lorem(anon, obj, field, val): """ Generates a paragraph of lorem ipsum text """ - return anon.faker.lorem(field=field) + return ' '.join(anon.faker.sentences(field=field)) def unique_lorem(anon, obj, field, val): diff --git a/anonymizer/tests/models.py b/anonymizer/tests/models.py index 9746c80..43a957c 100644 --- a/anonymizer/tests/models.py +++ b/anonymizer/tests/models.py @@ -1,3 +1,5 @@ +import uuid + from django.db import models @@ -9,12 +11,11 @@ class EverythingModel(models.Model): name = models.CharField(max_length=30) email = models.EmailField() username = models.CharField(max_length=20, unique=True) - address_city = models.CharField(max_length=50) - address_post_code = models.CharField(max_length=10) address = models.TextField() o1 = models.ForeignKey(Other) - something = models.TextField() - something_else = models.TextField() + lorem = models.TextField() + similar_lorem = models.TextField() + unique_lorem = models.TextField(unique=True) some_varchar = models.CharField(max_length=5) birthday = models.DateTimeField() age = models.PositiveSmallIntegerField() @@ -25,3 +26,20 @@ class EverythingModel(models.Model): ('F', 'Female')]) price = models.DecimalField(decimal_places=2, max_digits=10) binary = models.BinaryField() + uuid = models.UUIDField(default=uuid.uuid4) + boolean = models.BooleanField() + small_integer = models.SmallIntegerField() + positive_small_integer = models.PositiveSmallIntegerField() + postcode = models.CharField(max_length=9) + country = models.CharField(max_length=45) + first_name = models.CharField(max_length=5) + last_name = models.CharField(max_length=5) + similar_email = models.EmailField() + phonenumber = models.CharField(max_length=10) + last_name = models.CharField(max_length=11) + street_address = models.CharField(max_length=15) + state = models.CharField(max_length=2) + zip_code = models.CharField(max_length=9) + company = models.CharField(max_length=30) + similar_datetime = models.DateTimeField() + similar_date = models.DateField() diff --git a/anonymizer/tests/tests.py b/anonymizer/tests/tests.py index 4639a69..27d95e9 100644 --- a/anonymizer/tests/tests.py +++ b/anonymizer/tests/tests.py @@ -1,14 +1,15 @@ -from datetime import datetime, timedelta, date import decimal -import six +import uuid import zlib +from datetime import date, datetime, timedelta +import six +from anonymizer import Anonymizer, introspect +from anonymizer.tests import models as test_models from django.apps import apps from django.test import TestCase -from six.moves import xrange -from anonymizer import Anonymizer, introspect -from anonymizer.tests import models as test_models +from six.moves import xrange def compress(num): @@ -43,15 +44,14 @@ class EverythingModelAnonymizer(Anonymizer): attributes = [ ('id', "SKIP"), - ('username', "username"), ('name', "name"), ('email', "email"), - ('address_city', "city"), - ('address_post_code', "uk_postcode"), + ('username', "username"), ('address', "full_address"), ('o1_id', "SKIP"), - ('something', "lorem"), - ('something_else', "lorem"), + ('lorem', "lorem"), + ('similar_lorem', "lorem"), + ('unique_lorem', "lorem"), ('some_varchar', "varchar"), ('birthday', "datetime"), ('age', "positive_small_integer"), @@ -61,6 +61,22 @@ class EverythingModelAnonymizer(Anonymizer): ('sex', "choice"), ('price', "decimal"), ('binary', UNKNOWN_FIELD), + ('uuid', UNKNOWN_FIELD), + ('boolean', "bool"), + ('small_integer', "small_integer"), + ('positive_small_integer', "positive_small_integer"), + ('postcode', "varchar"), + ('country', "varchar"), + ('first_name', "first_name"), + ('similar_email', "email"), + ('phonenumber', "phonenumber"), + ('last_name', "last_name"), + ('street_address', "full_address"), + ('state', "state"), + ('zip_code', "zip_code"), + ('company', "varchar"), + ('similar_datetime', "datetime"), + ('similar_date', "date"), ] """ self.assertEqual(mod.strip(), expected.strip()) @@ -77,18 +93,37 @@ def setUp(self): self.now = datetime.now() self.today = date.today() - instances = (test_models.EverythingModel(id=x, - o1=self.o1, - username="intial%d" % x, - birthday=self.now + timedelta(365 * x), - age=x, - some_datetime=self.now, - some_date=self.today, - sex='X', - price=decimal.Decimal("1.23"), - binary=compress(x), - ) - for x in xrange(1, self.NUM_ITEMS + 1)) + instances = (test_models.EverythingModel( + id=x, + o1=self.o1, + username="intial%d" % x, + lorem='hello world', + similar_lorem='Hello, world!', + unique_lorem='Hello, world! #%d' % x, + birthday=self.now + timedelta(365 * x), + age=x, + some_datetime=self.now, + some_date=self.today, + sex='X', + price=decimal.Decimal("1.23"), + binary=compress(x), + uuid=str(uuid.uuid4()), + boolean=True, + small_integer=-1, + positive_small_integer=1, + postcode='12345-12345', + country='Qatar', + first_name='Joe', + last_name='Schmoe', + similar_email='monkey@betterworks.com', + phonenumber='(555) 555-5555', + street_address='123 Maple Street', + state='CA', + zip_code=94063, + company='BetterWorks', + similar_datetime=self.now, + similar_date=self.today, + ) for x in xrange(1, self.NUM_ITEMS + 1)) test_models.EverythingModel.objects.bulk_create(instances) @@ -107,11 +142,10 @@ class EverythingAnonmyizer(Anonymizer): ('username', 'username'), ('name', "name"), ('email', "email"), - ('address_city', "city"), - ('address_post_code', "uk_postcode"), ('address', "full_address"), - ('something', "lorem"), - ('something_else', "similar_lorem"), + ('lorem', "lorem"), + ('similar_lorem', "similar_lorem"), + ('unique_lorem', "unique_lorem"), ('some_varchar', "varchar"), ('birthday', "datetime"), ('age', "positive_small_integer"), @@ -119,21 +153,32 @@ class EverythingAnonmyizer(Anonymizer): ('some_date', "date"), ('sex', "choice"), ('price', "decimal"), - ('binary', lambda anon, obj, field, val: compress(decompress(val) * -1)) + ('binary', lambda anon, obj, field, val: compress(decompress(val) * -1)), + ('boolean', "bool"), + ('uuid', "uuid"), + ('small_integer', "small_integer"), + ('positive_small_integer', "positive_small_integer"), + ('postcode', "postcode"), + ('country', "country"), + ('first_name', "first_name"), + ('last_name', "last_name"), + ('similar_email', "similar_email"), + ('phonenumber', "phonenumber"), + ('street_address', "street_address"), + ('state', "state"), + ('zip_code', "zip_code"), + ('company', "company"), + ('similar_datetime', "similar_datetime"), + ('similar_date', "similar_date"), ] EverythingAnonmyizer().run(parallel=0) - objs = test_models.EverythingModel.objects.all() - self.assertEqual(len(objs), self.NUM_ITEMS) - for o in objs: + instances = test_models.EverythingModel.objects.all() + self.assertEqual(len(instances), self.NUM_ITEMS) + for o in instances: # check everything has been changed self.assertFalse(o.username.startswith('initial')) - # Check for corresponding user names/emails. This works if username - # is first in the list, as recommended and as introspection - # generates. - self.assertTrue(o.email.startswith(o.username)) - # test for DjangoFaker.choice self.assertTrue(o.sex in ('M', 'F')) diff --git a/setup.py b/setup.py index fc6dc2b..51ba2bb 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ def read(*rnames): "Topic :: Database" ], install_requires=[ - 'faker >= 0.0.4-bw', + 'fake-factory >= 0.5.6', 'django >= 1.8.0', 'six >= 1.10.0'], )