Generate Sample Data using Faker
In this article, let's see how we can generate sample data in CSV or JSON format for quick testing of data pipeline jobs.
Faker is a Python package that generates fake data for us. Whether you need to bootstrap your database, create good-looking XML documents, fill-in your persistence to stress test it, or anonymize data taken from a production service, Faker is for you.
Refer: https://faker.readthedocs.io/en/master/ for installation and basic usage.
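For a quick feel of the library, here is a minimal sketch (assuming Faker is installed, e.g. via pip install Faker) that prints a few fake values:
from faker import Faker

fake = Faker()

print(fake.name())     # a fake person name, e.g. "John Smith"
print(fake.address())  # a fake postal address
print(fake.profile())  # a dict of profile fields (name, mail, birthdate, ...) used by the script below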
Script to generate sample data in CSV/JSON format
Below is the script to generate the fake data needed for quick testing in the desired format (CSV/JSON).
import argparse
import csv
import json
from datetime import date, datetime
from decimal import Decimal

from faker import Faker

fake = Faker()


class CustomEncoder(json.JSONEncoder):
    # Faker's profile() returns Decimal and date values that json.dumps
    # cannot serialize by default, so convert them to strings here.
    def default(self, obj):
        if isinstance(obj, Decimal):
            return str(obj)
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()  # or return str(obj)
        return json.JSONEncoder.default(self, obj)


def main():
    parser = argparse.ArgumentParser(description='Faker')
    parser.add_argument('--output_format', default="csv", type=str,
                        help='Pass the output format. Supported: json/csv')
    parser.add_argument('--number_of_rows', default=100, type=int,
                        help='Pass the number of rows needed in the output file')
    args = parser.parse_args()
    output_format = args.output_format.lower()
    number_of_rows = args.number_of_rows

    with open("sample_data." + output_format, 'w', newline='') as file:
        writer = None
        for i in range(number_of_rows):
            data = fake.profile()
            if output_format == "csv":
                if i == 0:
                    # Write the header once, using the keys of the first record
                    writer = csv.DictWriter(file, fieldnames=data.keys())
                    writer.writeheader()
                writer.writerow(data)
            elif output_format == "json":
                # One JSON object per line (JSON Lines)
                file.write(json.dumps(data, cls=CustomEncoder))
                file.write("\n")
    print("Sample data generation done")


if __name__ == '__main__':
    main()
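Assuming the script is saved as generate_sample_data.py (the file name here is just an example), it can be run as follows to produce sample_data.csv or sample_data.json in the current directory:
python generate_sample_data.py --output_format csv --number_of_rows 500
python generate_sample_data.py --output_format json --number_of_rows 500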
Using Python's faker module along with the built-in random module, we can generate datasets tailored to our needs. If we need to test how an ETL job behaves with various datatypes, I would recommend generating the dataset locally rather than searching for such datasets online.
Example:
import random
from datetime import datetime, timedelta
from decimal import Decimal

from faker import Faker

fake = Faker()

int_col = random.randint(1000, 10000)
decimal_col = int_col / 2.123243  # or random.uniform(10.5, 75.5)
long_col = random.randint(1000, 10000) * random.randint(1000000, 1000000000)
float_col = float(Decimal(random.randrange(155, 389)) / 100)
date_col = fake.date_between(start_date='-90d', end_date='today')

# Random timestamp between now and 10 days from now
start_date = datetime.now()
end_date = start_date + timedelta(days=10)
random_date = start_date + (end_date - start_date) * random.random()
timestamp_col = str(random_date)

string_col = fake.name()
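As a rough sketch (the column names, file name, and row count below are just placeholders), these per-datatype generators can be combined into full records and written out with the same csv.DictWriter approach used earlier:
import csv
import random
from datetime import datetime, timedelta

from faker import Faker

fake = Faker()


def fake_row():
    # One record mixing the datatypes from the example above
    return {
        "int_col": random.randint(1000, 10000),
        "decimal_col": random.uniform(10.5, 75.5),
        "float_col": float(random.randrange(155, 389)) / 100,
        "date_col": fake.date_between(start_date='-90d', end_date='today'),
        "timestamp_col": str(datetime.now() + timedelta(days=10) * random.random()),
        "string_col": fake.name(),
    }


with open("typed_sample_data.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fake_row().keys())
    writer.writeheader()
    for _ in range(100):
        writer.writerow(fake_row())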
That's all. I hope you enjoyed it!