Python: merge multiple CSV files to netCDF

Software requirements:

  • Python 3

  • numpy

  • pandas

  • xarray

Example script#

read_multiple_csv_files_convert_to_netcdf.py

#!/usr/bin/env python
# coding: utf-8
'''
DKRZ example

Read and merge multiple CSV files - write netCDF file

Content

- generate multiple random data files
- assign time coordinate
- read multiple CSV files
  - get lat and lon coordinate values from file names
- merge coordinate variables and data to Xarray.Dataset
- write Xarray.Dataset to netCDF file

Data:         random data
File names:   ./random_data/test_data_*
Output file:  ./test_data_1961_01.nc

-------------------------------------------------------------------------------
2022 copyright DKRZ licensed under CC BY-NC-SA 4.0
               (https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en)
-------------------------------------------------------------------------------
'''
import glob
import os
import shutil

import numpy as np
import pandas as pd
import xarray as xr

#-- Generate random data files
#
# The coordinate information for each file is used to define the variable name.
# Each CSV file is a simple ASCII file with one column of data; the rows
# stand for the time steps.
#
# E.g. ./random_data/test_data_53.00_9.00
#
# 2.9215954594021842
# 9.309742116752075
# 4.793432399161088
# 2.385667109912153
# ...
def generate_random_data(outdir='./random_data'):
    """Create one random-data CSV file per grid point.

    Each file holds a single column of uniform random values in [0, 10),
    one row per daily time step of January 1961 (31 rows).  The grid
    coordinates are encoded in the file name, e.g.
    ``<outdir>/test_data_53.00_9.00`` (lat first, then lon).

    Parameters
    ----------
    outdir : str
        Directory the CSV files are written to; created if missing.
        Defaults to ``./random_data`` for backward compatibility.
    """
    # exist_ok avoids the racy exists()-then-makedirs() check.
    os.makedirs(outdir, exist_ok=True)

    lons = np.arange(9., 12.25, 0.25)     # 13 longitudes: 9.00 .. 12.00
    lats = np.arange(53., 55.25, 0.25)    # 9 latitudes: 53.00 .. 55.00
    time = pd.date_range('1961-01-01T12:00:00', '1961-01-31T12:00:00')

    # Iterate the coordinate values directly instead of index ranges.
    for lat in lats:
        for lon in lons:
            rdata = pd.DataFrame(np.random.uniform(0., 10., time.size))
            fname = f'{outdir}/test_data_{lat:.2f}_{lon:.2f}'
            rdata.to_csv(fname, index=False, header=False)

#-- main
def main():
    """Generate the CSV input files, merge them onto a (time, lat, lon)
    grid and write the result to ./test_data_1961_01.nc."""
    # Generate the multiple random data files
    generate_random_data()

    # Read the files from where generate_random_data() actually writes
    # them.  (The original globbed $HOME/Python/ASCII/data, which does
    # not match the generator's ./random_data output directory.)
    infiles = sorted(glob.glob('./random_data/test_data_*'))

    # Assign time coordinate: one value per CSV row (daily, Jan 1961).
    start_date = '1961-01-01T12:00:00'
    end_date   = '1961-01-31T12:00:00'

    time = pd.date_range(start_date, end_date)

    # Read multiple CSV files
    #
    # Key each file's data column by the (lat, lon) pair encoded in its
    # name, e.g. test_data_53.00_9.00.  A plain dict replaces the
    # original locals()[f'var_{i}'] trick (writing to locals() inside a
    # function is undefined behavior in CPython), and keying by the
    # actual coordinates fixes the mis-ordering caused by the lexical
    # sort of file names (e.g. '10.00' sorts before '9.00' as a string).
    columns = {}
    for f in infiles:
        # Parse coordinates from the base name so the result does not
        # depend on underscores in the directory path.
        parts = os.path.basename(f).split(sep='_')
        lat = float(parts[2])
        lon = float(parts[3])

        dfi = pd.read_csv(f, header=None)
        columns[(lat, lon)] = dfi[0].to_numpy()

    # Unique, numerically sorted grid coordinates.
    lat_array = sorted({key[0] for key in columns})
    lon_array = sorted({key[1] for key in columns})

    # Merge coordinate variables and data
    #
    # Combine the per-file columns into one (time, lat, lon) array,
    # looking each column up by its coordinates.
    combine_data = np.zeros((len(time), len(lat_array), len(lon_array)))
    for i, lat in enumerate(lat_array):
        for j, lon in enumerate(lon_array):
            combine_data[:, i, j] = columns[(lat, lon)]

    # Assign data to Xarray.DataArray and afterwards convert it to a Xarray.Dataset.
    data = xr.DataArray(data=combine_data,
                        dims=["time", "lat", "lon"],
                        coords=dict(time=time.values,
                                    lat=(["lat"], lat_array),
                                    lon=(["lon"], lon_array)),
                        attrs=dict(description="Random data variable",
                                   units="units",),
                     )
    ds = data.to_dataset(name='rvar')

    # Xarray writes the time coordinate as int64 by default; set units
    # and calendar explicitly via .encoding so other tools read it as a
    # conventional CF time axis.  Also attach CF attributes to the data
    # variable and the grid coordinates.
    ds.time.encoding['units'] = "days since 1961-01-01 00:00:00"
    ds.time.encoding['calendar'] = "proleptic_gregorian"

    ds.rvar.attrs = {'standard_name':'random_variable', 'units':'units', 'coordinates':'lat lon'}
    ds.lat.attrs = {'standard_name':'latitude', 'units':'degrees_north'}
    ds.lon.attrs = {'standard_name':'longitude', 'units':'degrees_east'}

    # Write dataset to netCDF file.  mode='w' creates/overwrites the
    # file; the original mode='a' raises when the file does not exist.
    ds.to_netcdf('./test_data_1961_01.nc', engine='netcdf4', mode='w')

    # Delete folder random_data (portable; no shell subprocess).
    shutil.rmtree('random_data', ignore_errors=True)
    
    
# Run the example only when executed as a script, not on import.
if __name__ == "__main__":
    main()