Python: merge multiple CSV files to netCDF
Software requirements:
Python 3
numpy
pandas
xarray
Example script#
read_multiple_csv_files_convert_to_netcdf.py
#!/usr/bin/env python
# coding: utf-8
'''
DKRZ example
Read and merge multiple CSV files - write netCDF file
Content
- generate multiple random data files
- assign time coordinate
- read multiple CSV files
- get lat and lon coordinate values from file names
- merge coordinate variables and data to Xarray.Dataset
- write Xarray.Dataset to netCDF file
Data: random data
File names: ./random_data/test_data_*
Output file: ./test_data_1961_01.nc
-------------------------------------------------------------------------------
2022 copyright DKRZ licensed under CC BY-NC-SA 4.0
(https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en)
-------------------------------------------------------------------------------
'''
import glob
import os
import shutil

import numpy as np
import pandas as pd
import xarray as xr
#-- Generate random data files
#
# The coordinate information for each file is used to define the variable name.
# The CSV file is a simple ASCII file with one column of data; the rows
# stand for the time steps.
#
# E.g. ./random_data/test_data_53.00_9.00
#
# 2.9215954594021842
# 9.309742116752075
# 4.793432399161088
# 2.385667109912153
# ...
def generate_random_data(out_dir='./random_data'):
    """Generate one CSV file of uniform random data per grid point.

    Creates *out_dir* if necessary and writes one headerless, indexless
    single-column CSV file per (lat, lon) combination, named
    ``test_data_<lat>_<lon>`` with two decimals per coordinate.  Each file
    has one row per daily time step of January 1961 (31 rows) holding a
    random float drawn uniformly from [0, 10).

    Parameters
    ----------
    out_dir : str
        Target directory for the generated files (default './random_data',
        which preserves the original behavior).
    """
    # exist_ok=True is atomic and idempotent -- avoids the
    # exists()-then-makedirs race of the original.
    os.makedirs(out_dir, exist_ok=True)
    lon = np.arange(9., 12.25, 0.25)    # 13 longitudes: 9.00 .. 12.00
    lat = np.arange(53., 55.25, 0.25)   # 9 latitudes: 53.00 .. 55.00
    time = pd.date_range('1961-01-01T12:00:00', '1961-01-31T12:00:00')
    for la in lat:
        for lo in lon:
            rdata = pd.DataFrame(np.random.uniform(0., 10., time.size))
            fname = f'{out_dir}/test_data_{la:.2f}_{lo:.2f}'
            rdata.to_csv(fname, index=False, header=False)
#-- main
def main():
    """Read the generated CSV files, merge them onto a (time, lat, lon)
    grid and write the result to the netCDF file './test_data_1961_01.nc'.
    """
    # Generate the multiple random data files in ./random_data.
    generate_random_data()

    # Path to the generated files.  NOTE: the original globbed
    # $HOME/Python/ASCII/data/test_data_*, which never matches the files
    # written by generate_random_data(), so nothing was processed.
    infiles = sorted(glob.glob('./random_data/test_data_*'))

    # Time coordinate: one daily step per CSV row (January 1961).
    start_date = '1961-01-01T12:00:00'
    end_date = '1961-01-31T12:00:00'
    time = pd.date_range(start_date, end_date)

    # Read the CSV files.  Each file's lat/lon is encoded in its name
    # ('test_data_<lat>_<lon>'), so take the last two '_'-separated fields
    # of the basename -- splitting the full path is fragile because the
    # directory part may itself contain underscores.
    #
    # The data columns are kept in a dict keyed on (lat, lon).  The
    # original stored them via locals()[f'var_{i}'] = ..., which is not
    # guaranteed to work inside a function (and is broken by PEP 667 in
    # Python 3.13+).
    lat_array, lon_array = [], []
    data_by_coord = {}
    for f in infiles:
        dfi = pd.read_csv(f, header=None)
        _, lat_s, lon_s = os.path.basename(f).rsplit('_', 2)
        lat, lon = float(lat_s), float(lon_s)
        lat_array.append(lat)
        lon_array.append(lon)
        data_by_coord[(lat, lon)] = dfi[0].to_numpy()

    # Unique, sorted grid coordinate values.
    lat_array = sorted(set(lat_array))
    lon_array = sorted(set(lon_array))

    # Merge the per-point columns into one (time, lat, lon) array.
    combine_data = np.zeros((len(time), len(lat_array), len(lon_array)))
    for i, la in enumerate(lat_array):
        for j, lo in enumerate(lon_array):
            combine_data[:, i, j] = data_by_coord[(la, lo)]

    # Assign data to an Xarray.DataArray and convert it to a Dataset.
    data = xr.DataArray(data=combine_data,
                        dims=["time", "lat", "lon"],
                        coords=dict(time=time.values,
                                    lat=(["lat"], lat_array),
                                    lon=(["lon"], lon_array)),
                        attrs=dict(description="Random data variable",
                                   units="units"),
                        )
    ds = data.to_dataset(name='rvar')

    # Xarray would otherwise store the time coordinate as int64; set the
    # units and calendar explicitly so it is written CF-conformant, and
    # add attributes to the data variable and the grid coordinates.
    ds.time.encoding['units'] = "days since 1961-01-01 00:00:00"
    ds.time.encoding['calendar'] = "proleptic_gregorian"
    ds.rvar.attrs = {'standard_name': 'random_variable',
                     'units': 'units',
                     'coordinates': 'lat lon'}
    ds.lat.attrs = {'standard_name': 'latitude', 'units': 'degrees_north'}
    ds.lon.attrs = {'standard_name': 'longitude', 'units': 'degrees_east'}

    # Write the dataset to a netCDF file.  mode='w' (not 'a'): append
    # mode raises FileNotFoundError when the file does not exist yet.
    ds.to_netcdf('./test_data_1961_01.nc', engine='netcdf4', mode='w')

    # Delete the folder of generated input data (shutil instead of
    # shelling out to 'rm -rf').
    shutil.rmtree('random_data', ignore_errors=True)
# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()