How to stop Pytest from appending to CSV created after initial test

2022-12-07 20:17 问答作者：

I am testing a class of functions that apply specific transformations to columns of a CSV file retrieved from an S3 bucket. The test functions should retrieve the 'test_data.csv' file from the S3 bucket created using the levels_etl and levels_etl_with_test_csv_data fixtures and create a new CSV with the transformations applied.

The problem I am having is that each of the test functions passes when run individually, but when they are run as part of a class, the first test runs successfully and all the other tests fail. For some reason, instead of creating a new CSV with the transformations applied, the CSV output is appended to the CSV created in the previous test, which causes the assertions to fail, with each successive test appending to the CSV.

Setup Code:

@pytest.fixture
def levels_etl():
    """Yield a ``Levels_ETL`` instance wired to a mocked S3 bucket.

    Setup starts the ``mock_s3`` mock, stores placeholder credentials in the
    environment under the variable names passed to ``S3BucketConnector``,
    creates the ``test-bucket`` bucket and builds the ``Levels_ETL`` object.

    The mock is stopped in a ``finally`` clause so that it is released even
    when a setup step after ``start()`` raises; otherwise the mock would stay
    active and leak into later tests.
    """
    # Mocking S3 connection start
    mock_bucket = mock_s3()
    mock_bucket.start()
    try:
        # Defining Class Arguments (the first two are environment variable
        # *names*, not the credential values themselves)
        s3_access_key = 'AWS_ACCESS_KEY_ID'
        s3_secret_key = 'AWS_SECRET_ACCESS_KEY'
        s3_endpoint_url = 'https://s3.us-east-2.amazonaws.com'
        s3_bucket_name = 'test-bucket'
        # Creating s3 access keys as environment variables
        os.environ[s3_access_key] = 'KEY1'
        os.environ[s3_secret_key] = 'KEY2'
        s3 = boto3.resource(service_name='s3', endpoint_url=s3_endpoint_url)
        s3.create_bucket(
            Bucket=s3_bucket_name,
            CreateBucketConfiguration={'LocationConstraint': 'us-east-2'},
        )
        # Creating Test instance
        s3_bucket_conn = S3BucketConnector(
            s3_access_key, s3_secret_key, s3_endpoint_url, s3_bucket_name
        )
        yield Levels_ETL(s3_bucket_conn)
    finally:
        # Teardown: always stop the mock, including on setup failure
        mock_bucket.stop()

@pytest.fixture
def levels_etl_with_test_csv_data(tmpdir_factory,levels_etl):
    """Yield ``levels_etl`` after uploading a sample 'test_data.csv' to its bucket.

    A CSV file with one header row and nineteen data rows is written to a
    temporary directory and uploaded under the key 'test_data.csv'.  After
    the consuming test finishes, that key is deleted from the bucket again.
    """
    csv_path = str(tmpdir_factory.mktemp('data').join('test_data.csv'))

    columns = (
        'date', 'company', 'location', 'title', 'level', 'specialisation',
        'gender', 'years_of_experience', 'years_at_company', 'base_salary',
        'stock', 'bonus',
    )
    records = (
        ('1/1/2017 11:33:27', 'Google', 'Sunnyvale, CA', 'Software Engineer',
         'L3', 'android', 'male', '1', '0', '120000', '40000', '15000'),
        ('4/20/2017 11:33:27', 'Apple', 'Austin, TX', 'Software Engineer',
         'ICT2', 'iOS Development', 'female', '1', '0', '90', '30', '20'),
        ('4/20/2017 11:33:27', 'Microsoft', 'Bellevue, WA', 'Product Manager',
         '59', 'UX/UI', 'Male', '0', '0', '0', '0', '0'),
        ('7/15/2017 11:33:27', 'Hubspot', 'Cambridge, MA, United States',
         'Software Engineer', 'Junior', 'Site Reliability (SRE)', '', '', '',
         '135', '5', '0'),
        ('10/11/2017 11:33:27', 'Facebook', 'Menlo Park, CA',
         'Software Engineer', 'E5', 'production', 'male', '11', '2', '215',
         '100', '40'),
        ('10/11/2017 11:33:27', 'Facebook', 'Menlo Park, CA',
         'Software Engineer', 'E5', 'production', 'male', '11', '2', '215',
         '100', '40'),
        ('12/11/2017 11:33:27', 'spotify', 'New York, NY', 'Software Engineer',
         'Engineer 1', 'fullstack developer', 'male', '4', '0', '180', '37.5',
         '0'),
        ('1/30/2018 11:33:27', 'Intel', 'Santa Clara, CA', 'Software Engineer',
         'grade 9', 'augmented reality', 'male', '20', '5', '204', '50', '20'),
        ('1/30/2018 11:33:27', 'Intel', 'Santa Clara, CA', 'Software Engineer',
         'grade 9', 'virtual reality', 'male', '20', '5', '204', '50', '20'),
        ('3/30/2018 11:33:27', 'Netflix', 'Denver, CO', 'Software Engineer',
         'E5', 'Web Development (front-end)', 'male', '20', '2', '591', '0',
         '0'),
        ('4/7/2018 11:33:27', 'Sony Interactive Entertainment',
         'San Francisco, CA', 'Software Engineer', 'L4', 'backend tools',
         'male', '6', '6', '103', '5', '32'),
        ('5/9/2018 11:33:27', 'Lyft', 'New York, NY', 'Data Scientist', 't6',
         'algorithms', 'male', '6', '3', '200', '200', '0'),
        ('11/11/2018 11:33:27', 'Hudson River Trading', 'New York, NY',
         'Software Engineer', 'L4', 'algorithm', 'male', '6', '4', '431', '0',
         '1700'),
        ('4/7/2019 11:33:27', 'Facebook', 'Chicago, IL', 'Product Designer',
         'IC4', 'user experience', 'female', '7', '0', '143', '40', '22.7'),
        ('4/7/2019 11:33:27', 'Facebook', 'New York, NY', 'Product Designer',
         'IC4', 'ux', 'female', '7', '2', '173', '40', '0'),
        ('4/7/2019 11:33:27', 'Mango Voice', 'Salt Lake City, UT',
         'Product Designer', 'l3', 'ui', 'female', '5', '3', '74.5', '0', '0'),
        ('9/13/2020 11:33:27', 'No Salary Startup', 'Chicago, IL',
         'Product Designer', '', 'user interface', 'female', '0', '0', '0',
         '100', '0'),
        ('4/7/2021 11:33:27', '', 'Chicago, IL', '', 'IC4', 'user experience',
         'female', '7', '0', '143', '40', '22.7'),
        ('4/7/2021 11:33:27', 'twitter', 'Washington, DC', 'software engineer',
         'swe II', 'data', 'male', '2', '2', '150', '60', '0'),
    )

    # Header first, then every data row, exactly in the order listed above.
    with open(csv_path, 'w', encoding='UTF-8', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(columns)
        out.writerows(records)

    levels_etl.s3_bucket._bucket.upload_file(Filename=csv_path, Key='test_data.csv')

    yield levels_etl

    # Teardown: remove the uploaded source file from the bucket.
    stale_objects = {'Objects': [{'Key': 'test_data.csv'}]}
    levels_etl.s3_bucket._bucket.delete_objects(Delete=stale_objects)

Test Class Functions (2 of many)

def test_transform_job_data(self,levels_etl_with_test_csv_data):
        """Check the 'job_data.csv' object produced by ``transform_job_data``.

        Runs the transformation on 'test_data.csv', reads 'job_data.csv' back
        from the bucket and verifies the float columns, the absence of
        duplicate rows and of rows with zero base salary and zero stock, and
        the compensation values for the Google and Apple rows.
        """
        key_exp='test_data.csv'
        levels_etl_with_test_csv_data.transform_job_data(key=key_exp)
        jobdata_csv=levels_etl_with_test_csv_data.s3_bucket._bucket.Object(key='job_data.csv').get().get('Body').read().decode('UTF-8')
        print('jobdata_csv',jobdata_csv)
        job_data_df=pd.read_csv(StringIO(jobdata_csv))
        assert list(job_data_df.select_dtypes(include=['float']).columns)==['years_of_experience','years_at_company',
        'base_salary','stock','bonus']
        assert not job_data_df.duplicated().any()
        assert not ((job_data_df['base_salary']==0) & (job_data_df['stock']==0)).any()
        # NOTE(review): pd.read_csv presumably loads empty fields as NaN, so
        # comparing against '' likely never matches and this check is vacuous.
        # Confirm the intended rule before switching to isna().
        assert not ((job_data_df['company']=='') & (job_data_df['title']=='')).any()
        google=job_data_df[job_data_df['company']=='Google']
        assert google['base_salary'].values[0]==120000.00
        assert google['stock'].values[0]==40000.00
        assert google['bonus'].values[0]==15000.00
        apple=job_data_df[job_data_df['company']=='Apple']
        assert apple['base_salary'].values[0]==90000.00
        assert apple['stock'].values[0]==30000.00
        # The fixture's Apple row has bonus '20'; base_salary '90' and stock
        # '30' are expected as 90000 and 30000, so bonus must be 20000.
        assert apple['bonus'].values[0]==20000.00
    
    def test_transform_dates(self,levels_etl_with_test_csv_data):
        """Check the 'date.csv' object produced by ``transform_dates``.

        Runs the transformation on 'test_data.csv', reads 'date.csv' back
        from the bucket and verifies its column names and the date, year,
        month and quarter values for all nineteen fixture rows.
        """
        key_exp='test_data.csv'
        levels_etl_with_test_csv_data.transform_dates(key=key_exp)
        date_csv=levels_etl_with_test_csv_data.s3_bucket._bucket.Object(key='date.csv').get().get('Body').read().decode('UTF-8')
        print('date_csv',date_csv)
        date_df=pd.read_csv(StringIO(date_csv))
        assert list(date_df.columns)==['date','year','month','quarter']
        assert date_df['date'].tolist()==['2017-01-01','2017-04-20','2017-04-20','2017-07-15',
        '2017-10-11','2017-10-11','2017-12-11','2018-01-30','2018-01-30','2018-03-30','2018-04-07','2018-05-09',
        '2018-11-11','2019-04-07','2019-04-07','2019-04-07','2020-09-13','2021-04-07','2021-04-07']
        assert date_df['year'].tolist()==[2017,2017,2017,2017,2017,2017,2017,2018,2018,2018,2018,2018,2018,
        2019,2019,2019,2020,2021,2021]
        # This comparison previously lacked `assert`, so the month column was
        # never actually verified.
        assert date_df['month'].tolist()==[1,4,4,7,10,10,12,1,1,3,4,5,11,4,4,4,9,4,4]
        assert date_df['quarter'].tolist()==[1,2,2,3,4,4,4,1,1,1,2,2,4,2,2,2,3,2,2]

The transform_job_data and transform_dates functions both retrieve the 'test_data.csv' file from the S3 bucket, apply pandas dataframe transformations and then convert back to CSV and upload new CSV to S3.

With the first test I get the expected CSV output:

jobdata_csv date,company,location,title,level,specialisation,gender,years_of_experience,years_at_company,base_salary,stock,bonus 1/1/2017 11:33:27,Google,"Sunnyvale, CA",Software Engineer,L3,android,male,1.0,0.0,120000.0,40000.0,15000.0 4/20/2017 11:33:27,Apple,"Austin, TX",Software Engineer,ICT2,iOS Development,female,1.0,0.0,90000.0,30000.0,20000.0 7/15/2017 11:33:27,Hubspot,"Cambridge, MA, United States",Software Engineer,Junior,Site Reliability (SRE),,,,135000.0,5000.0,0.0 10/11/2017 11:33:27,Facebook,"Menlo Park, CA",Software Engineer,E5,production,male,11.0,2.0,215000.0,100000.0,40000.0 12/11/2017 11:33:27,spotify,"New York, NY",Software Engineer,Engineer 1,fullstack developer,male,4.0,0.0,180000.0,37500.0,0.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,augmented reality,male,20.0,5.0,204000.0,50000.0,20000.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,virtual reality,male,20.0,5.0,204000.0,50000.0,20000.0 3/30/2018 11:33:27,Netflix,"Denver, CO",Software Engineer,E5,Web Development (front-end),male,20.0,2.0,591000.0,0.0,0.0 4/7/2018 11:33:27,Sony Interactive Entertainment,"San Francisco, CA",Software Engineer,L4,backend tools,male,6.0,6.0,103000.0,5000.0,32000.0 5/9/2018 11:33:27,Lyft,"New York, NY",Data Scientist,t6,algorithms,male,6.0,3.0,200000.0,200000.0,0.0 11/11/2018 11:33:27,Hudson River Trading,"New York, NY",Software Engineer,L4,algorithm,male,6.0,4.0,431000.0,0.0,1700000.0 4/7/2019 11:33:27,Facebook,"Chicago, IL",Product Designer,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 4/7/2019 11:33:27,Facebook,"New York, NY",Product Designer,IC4,ux,female,7.0,2.0,173000.0,40000.0,0.0 4/7/2019 11:33:27,Mango Voice,"Salt Lake City, UT",Product Designer,l3,ui,female,5.0,3.0,74500.0,0.0,0.0 9/13/2020 11:33:27,No Salary Startup,"Chicago, IL",Product Designer,,user interface,female,0.0,0.0,0.0,100000.0,0.0 4/7/2021 11:33:27,,"Chicago, IL",,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 
4/7/2021 11:33:27,twitter,"Washington, DC",software engineer,swe II,data,male,2.0,2.0,150000.0,60000.0,0.0

But for the second one, it appends to the CSV from the prior test instead of creating a new CSV with only the date, year, month and quarter columns:

date_csv date,company,location,title,level,specialisation,gender,years_of_experience,years_at_company,base_salary,stock,bonus 1/1/2017 11:33:27,Google,"Sunnyvale, CA",Software Engineer,L3,android,male,1.0,0.0,120000.0,40000.0,15000.0 4/20/2017 11:33:27,Apple,"Austin, TX",Software Engineer,ICT2,iOS Development,female,1.0,0.0,90000.0,30000.0,20000.0 7/15/2017 11:33:27,Hubspot,"Cambridge, MA, United States",Software Engineer,Junior,Site Reliability (SRE),,,,135000.0,5000.0,0.0 10/11/2017 11:33:27,Facebook,"Menlo Park, CA",Software Engineer,E5,production,male,11.0,2.0,215000.0,100000.0,40000.0 12/11/2017 11:33:27,spotify,"New York, NY",Software Engineer,Engineer 1,fullstack developer,male,4.0,0.0,180000.0,37500.0,0.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,augmented reality,male,20.0,5.0,204000.0,50000.0,20000.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,virtual reality,male,20.0,5.0,204000.0,50000.0,20000.0 3/30/2018 11:33:27,Netflix,"Denver, CO",Software Engineer,E5,Web Development (front-end),male,20.0,2.0,591000.0,0.0,0.0 4/7/2018 11:33:27,Sony Interactive Entertainment,"San Francisco, CA",Software Engineer,L4,backend tools,male,6.0,6.0,103000.0,5000.0,32000.0 5/9/2018 11:33:27,Lyft,"New York, NY",Data Scientist,t6,algorithms,male,6.0,3.0,200000.0,200000.0,0.0 11/11/2018 11:33:27,Hudson River Trading,"New York, NY",Software Engineer,L4,algorithm,male,6.0,4.0,431000.0,0.0,1700000.0 4/7/2019 11:33:27,Facebook,"Chicago, IL",Product Designer,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 4/7/2019 11:33:27,Facebook,"New York, NY",Product Designer,IC4,ux,female,7.0,2.0,173000.0,40000.0,0.0 4/7/2019 11:33:27,Mango Voice,"Salt Lake City, UT",Product Designer,l3,ui,female,5.0,3.0,74500.0,0.0,0.0 9/13/2020 11:33:27,No Salary Startup,"Chicago, IL",Product Designer,,user interface,female,0.0,0.0,0.0,100000.0,0.0 4/7/2021 11:33:27,,"Chicago, IL",,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 
4/7/2021 11:33:27,twitter,"Washington, DC",software engineer,swe II,data,male,2.0,2.0,150000.0,60000.0,0.0 date,year,month,quarter 2017-01-01,2017,1,1 2017-04-20,2017,4,2 2017-04-20,2017,4,2 2017-07-15,2017,7,3 2017-10-11,2017,10,4 2017-10-11,2017,10,4 2017-12-11,2017,12,4 2018-01-30,2018,1,1 2018-01-30,2018,1,1 2018-03-30,2018,3,1 2018-04-07,2018,4,2 2018-05-09,2018,5,2 2018-11-11,2018,11,4 2019-04-07,2019,4,2 2019-04-07,2019,4,2 2019-04-07,2019,4,2 2020-09-13,2020,9,3 2021-04-07,2021,4,2 2021-04-07,2021,4,2

I have tried modifying the scopes of the pytest fixtures between class, session and function but I am not getting the desired result. I added teardown code that deletes the 'test_data.csv' object after each test in the levels_etl_with_test_csv_data fixture but that has had no impact either.

Where is my issue coming from?

继续阅读：amazon-s3 csv python

How to stop Pytest from appending to CSV created after initial test

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？