Python example: merge multiple CSV files to netCDF

Software requirements:

  • Python 3

  • numpy

  • pandas

  • xarray

Run the example script:

python read_multiple_csv_files_convert_to_netcdf.py

Script read_multiple_csv_files_convert_to_netcdf.py:

#!/usr/bin/env python
# coding: utf-8
#
# DKRZ example
#
# Read and merge multiple CSV files - write netCDF file
#
# Content
#
# - generate random data files
# - assign time coordinate
# - read multiple CSV files
#   - get lat and lon coordinate values from file names
# - merge coordinate variables and data to Xarray.Dataset
# - write Xarray.Dataset to netCDF file
#
# Data:         random data
# File names:   ./data/test_data_*
# Output file:  ./test_data_1961_01.nc
#
# 2023 copyright DKRZ, kmf
#
import os, glob
import pandas as pd
import numpy as np
import xarray as xr

# Generate random data files
#
# The coordinate information for each file is used to define the variable name.
# Each CSV file is a simple ASCII file with a single column of data; the
# rows stand for the time steps.
#
# E.g. ./data/test_data_53.00_9.00
#
# 2.9215954594021842
# 9.309742116752075
# 4.793432399161088
# 2.385667109912153
# ...

def generate_random_data():
    """Write one CSV file of random daily values per (lat, lon) grid point.

    Each file ``./data/test_data_<lat>_<lon>`` holds a single unnamed
    column of uniform random values in [0, 10), one row per daily time
    step of January 1961.
    """
    # exist_ok avoids the check-then-create race of the original
    # "if not os.path.exists(...): os.makedirs(...)" pattern.
    os.makedirs('./data', exist_ok=True)
    lon = np.arange(9., 12.25, 0.25)
    lat = np.arange(53., 55.25, 0.25)
    time = pd.date_range('1961-01-01T12:00:00', '1961-01-31T12:00:00')
    # Iterate the coordinate values directly instead of via range(len(...)).
    for la in lat:
        for lo in lon:
            rdata = pd.DataFrame(np.random.uniform(0., 10., time.size))
            fname = f'./data/test_data_{la:.2f}_{lo:.2f}'
            rdata.to_csv(fname, index=False, header=False)

def main():
    """Read the generated CSV files, merge them onto a lat/lon grid and
    write the result to ``./test_data_1961_01.nc``."""

    #-- generate random data
    generate_random_data()

    # Read from the same directory generate_random_data() writes to.
    # BUGFIX: the original globbed $HOME/Python/ASCII/data, which only
    # worked if the script happened to live in that directory.
    infiles = sorted(glob.glob('./data/test_data_*'))

    # Assign time coordinate
    #
    # Define the time coordinate variable: one daily step per CSV row.
    start_date = '1961-01-01T12:00:00'
    end_date = '1961-01-31T12:00:00'

    time = pd.date_range(start_date, end_date)

    # Read multiple CSV files
    #
    # Parse latitude and longitude from each file name and keep the column
    # data in a dict keyed by (lat, lon).
    #
    # BUGFIX: the original stored per-file arrays via locals()[f'var_{i}']
    # (undefined behavior inside a function; writes are discarded on
    # Python 3.13+, PEP 667) and the merge loop assumed the lexicographic
    # sort order of the file names matched numeric (lat, lon) order — it
    # does not ('..._10.00' sorts before '..._9.00'), so data ended up at
    # the wrong longitude cells. Keying by coordinate fixes both.
    data_by_coord = {}
    lat_set, lon_set = set(), set()

    for f in infiles:
        dfi = pd.read_csv(f, header=None)

        # basename is 'test_data_<lat>_<lon>'; split from the right so a
        # directory path containing underscores cannot shift the fields.
        lat_str, lon_str = os.path.basename(f).rsplit('_', 2)[1:]
        lat = float(lat_str)
        lon = float(lon_str)
        lat_set.add(lat)
        lon_set.add(lon)

        data_by_coord[(lat, lon)] = dfi[0].to_numpy()

    # Unique, numerically sorted grid coordinate values.
    lat_array = sorted(lat_set)
    lon_array = sorted(lon_set)

    # Merge coordinate variables and data
    #
    # Place every per-file time series at its own (lat, lon) grid cell.
    combine_data = np.zeros((len(time), len(lat_array), len(lon_array)))

    for i, lat in enumerate(lat_array):
        for j, lon in enumerate(lon_array):
            combine_data[:, i, j] = data_by_coord[(lat, lon)]

    # Assign data to Xarray.DataArray and afterwards convert it to a Xarray.Dataset.
    data = xr.DataArray(data=combine_data,
                        dims=["time", "lat", "lon"],
                        coords=dict(time=time.values,
                                    lat=(["lat"], lat_array),
                                    lon=(["lon"], lon_array)),
                        attrs=dict(description="Random data variable",
                                   units="units",),
                     )
    ds = data.to_dataset(name='rvar')

    # Now that we have created the dataset, we can directly use Xarray's .to_netcdf()
    # method to write the dataset to a netCDF file. Unfortunately the time coordinate
    # is written in int64 and not as usual in float64 (double). Within the Python
    # world this is fine, but leaving it can lead to difficulties. Therefore we
    # have to set the units and the calendar explicitly via the .encoding dict
    # to make it work. We also add a few more attributes to the data variable and
    # the grid coordinates.
    ds.time.encoding['units'] = "days since 1961-01-01 00:00:00"
    ds.time.encoding['calendar'] = "proleptic_gregorian"

    ds.rvar.attrs = {'standard_name':'random_variable', 'units':'units', 'coordinates':'lat lon'}
    ds.lat.attrs = {'standard_name':'latitude', 'units':'degrees_north'}
    ds.lon.attrs = {'standard_name':'longitude', 'units':'degrees_east'}

    # Write dataset to netCDF file
    #
    # Delete output file if it exists and write the dataset to the netCDF output file.
    try:
        os.remove('./test_data_1961_01.nc')
    except OSError:
        pass

    ds.to_netcdf('./test_data_1961_01.nc', engine='netcdf4')


# Run the example only when executed as a script, not on import.
if __name__ == "__main__":
    main()