Source code for Corrfunc.io

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Routines to read galaxy catalogs from disk.
Updated with lognormal scripts!
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from os.path import dirname, abspath, splitext, exists as file_exists,\
    join as pjoin
import numpy as np
import struct
try:
    import pandas as pd
except ImportError:
    pd = None


__all__ = ('read_fastfood_catalog', 'read_ascii_catalog', 'read_catalog', 'read_fortran_catalog', 'read_lognormal_catalog')


[docs]def read_fastfood_catalog(filename, return_dtype=None, need_header=None): """ Read a galaxy catalog from a fast-food binary file. Parameters ----------- filename: string Filename containing the galaxy positions return_dtype: numpy dtype for returned arrays. Default ``numpy.float`` Specifies the datatype for the returned arrays. Must be in {np.float, np.float32} need_header: boolean, default None. Returns the header found in the fast-food file in addition to the X/Y/Z arrays. Returns -------- X, Y, Z: numpy arrays Returns the triplet of X/Y/Z positions as separate numpy arrays. If need_header is set, then the header is also returned Example -------- >>> import numpy as np >>> from os.path import dirname, abspath, join as pjoin >>> import Corrfunc >>> from Corrfunc.io import read_fastfood_catalog >>> filename = pjoin(dirname(abspath(Corrfunc.__file__)), ... "../theory/tests/data/", ... "gals_Mr19.ff") >>> X, Y, Z = read_fastfood_catalog(filename) >>> N = 20 >>> for x,y,z in zip(X[0:N], Y[0:N], Z[0:]): ... print("{0:10.5f} {1:10.5f} {2:10.5f}".format(x, y, z)) ... # doctest: +NORMALIZE_WHITESPACE 419.94550 1.96340 0.01610 419.88272 1.79736 0.11960 0.32880 10.63620 4.16550 0.15314 10.68723 4.06529 0.46400 8.91150 6.97090 6.30690 9.77090 8.61080 5.87160 9.65870 9.29810 8.06210 0.42350 4.89410 11.92830 4.38660 4.54410 11.95543 4.32622 4.51485 11.65676 4.34665 4.53181 11.75739 4.26262 4.31666 11.81329 4.27530 4.49183 11.80406 4.54737 4.26824 12.61570 4.14470 3.70140 13.23640 4.34750 5.26450 13.19833 4.33196 5.29435 13.21249 4.35695 5.37418 13.06805 4.24275 5.35126 13.19693 4.37618 5.28772 """ if return_dtype is None: return_dtype = np.float if return_dtype not in [np.float32, np.float]: msg = "Return data-type must be set and a valid numpy float" raise ValueError(msg) if not file_exists(filename): msg = "Could not find file = {0}".format(filename) raise IOError(msg) import struct try: from future.utils import bytes_to_native_str except ImportError: print("\n\tPlease run python -m pip install Corrfunc before using " "the 'Corrfunc' package\n") raise with open(filename, "rb") as f: skip1 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] idat = struct.unpack(bytes_to_native_str(b'@iiiii'), f.read(20))[0:5] skip2 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] assert skip1 == 20 and skip2 == 20,\ "fast-food file seems to be incorrect (reading idat)" ngal = idat[1] if need_header is not None: # now read fdat skip1 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] fdat = struct.unpack(bytes_to_native_str(b'@fffffffff'), f.read(36))[0:9] skip2 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] assert skip1 == 36 and skip2 == 36,\ "fast-food file seems to be incorrect (reading fdat )" skip1 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] znow = struct.unpack(bytes_to_native_str(b'@f'), f.read(4))[0] skip2 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] assert skip1 == 4 and skip2 == 4,\ "fast-food file seems to be incorrect (reading redshift)" else: fdat_bytes = 4 + 36 + 4 znow_bytes = 4 + 4 + 4 # seek over the fdat + znow fields + padding bytes # from current position f.seek(fdat_bytes + znow_bytes, 1) # read the padding bytes for the x-positions skip1 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] assert skip1 == ngal * 4 or skip1 == ngal * 8, \ "fast-food file seems to be corrupt (padding bytes)" # seek back 4 bytes from current position f.seek(-4, 1) pos = {} for field in 'xyz': skip1 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] assert skip1 == ngal * 4 or skip1 == ngal * 8, \ "fast-food file seems to be corrupt (padding bytes a)" # the next division must be the integer division input_dtype = np.float32 if skip1 // ngal == 4 else np.float array = np.fromfile(f, input_dtype, ngal) skip2 = struct.unpack(bytes_to_native_str(b'@i'), f.read(4))[0] if return_dtype == input_dtype: pos[field] = array else: pos[field] = [return_dtype(a) for a in array] x = np.array(pos['x']) y = np.array(pos['y']) z = np.array(pos['z']) if need_header is not None: return idat, fdat, znow, x, y, z else: return x, y, z
[docs]def read_ascii_catalog(filename, return_dtype=None): """ Read a galaxy catalog from an ascii file. Parameters ----------- filename: string Filename containing the galaxy positions return_dtype: numpy dtype for returned arrays. Default ``numpy.float`` Specifies the datatype for the returned arrays. Must be in {np.float, np.float32} Returns -------- X, Y, Z: numpy arrays Returns the triplet of X/Y/Z positions as separate numpy arrays. Example -------- >>> from __future__ import print_function >>> from os.path import dirname, abspath, join as pjoin >>> import Corrfunc >>> from Corrfunc.io import read_ascii_catalog >>> filename = pjoin(dirname(abspath(Corrfunc.__file__)), ... "../mocks/tests/data/", "Mr19_mock_northonly.rdcz.dat") >>> ra, dec, cz = read_ascii_catalog(filename) >>> N = 20 >>> for r,d,c in zip(ra[0:N], dec[0:N], cz[0:]): ... print("{0:10.5f} {1:10.5f} {2:10.5f}".format(r, d, c)) ... # doctest: +NORMALIZE_WHITESPACE 178.45087 67.01112 19905.28514 178.83495 67.72519 19824.02285 179.50132 67.67628 19831.21553 182.75497 67.13004 19659.79825 186.29853 68.64099 20030.64412 186.32346 68.65879 19763.38137 187.36173 68.15151 19942.66996 187.20613 68.56189 19996.36607 185.56358 67.97724 19729.32308 183.27930 67.11318 19609.71345 183.86498 67.82823 19500.44130 184.07771 67.43429 19440.53790 185.13370 67.15382 19390.60304 189.15907 68.28252 19858.85853 190.12209 68.55062 20044.29744 193.65245 68.36878 19445.62469 194.93514 68.34870 19158.93155 180.36897 67.50058 18671.40780 179.63278 67.51318 18657.59191 180.75742 67.95530 18586.88913 """ if return_dtype is None: return_dtype = np.float if not file_exists(filename): msg = "Could not find file = {0}".format(filename) raise IOError(msg) # check if pandas is available - much faster to read in the data # using pandas if pd is not None: df = pd.read_csv(filename, header=None, engine="c", dtype={"x": return_dtype, "y": return_dtype, "z": return_dtype}, delim_whitespace=True) x = np.asarray(df[0], dtype=return_dtype) y = np.asarray(df[1], dtype=return_dtype) z = np.asarray(df[2], dtype=return_dtype) else: x, y, z, _ = np.genfromtxt(filename, dtype=return_dtype, unpack=True) return x, y, z
[docs]def read_fortran_catalog(filename, return_dtype=None): if return_dtype is None: return_dtype = np.float with open(filename, mode='rb') as file: # b is important -> binary fileContent = file.read() nleading = 3*8+1*4 header = struct.unpack("dddi", fileContent[:nleading]) Lx, Ly, Lz, N = header data = struct.unpack("f" * ((len(fileContent) -nleading) // 4), fileContent[nleading:]) data = np.array(data, dtype=return_dtype) data = data.reshape((-1, 6)) x, y, z, *_ = data.T return x, y, z
[docs]def read_lognormal_catalog(n='2e-4'): filedir = '../theory/tests/data' filepath = pjoin(dirname(abspath(__file__)), filedir, f'cat_L750_n{n}_z057_patchy_lognormal_rlz0.bin') return read_fortran_catalog(filepath)
[docs]def read_catalog(filebase=None, return_dtype=np.float): """ Reads a galaxy/randoms catalog and returns 3 XYZ arrays. Parameters ----------- filebase: string (optional) The fully qualified path to the file. If omitted, reads the theory galaxy catalog under ../theory/tests/data/ return_dtype: numpy dtype for returned arrays. Default ``numpy.float`` Specifies the datatype for the returned arrays. Must be in {np.float, np.float32} Returns -------- ``x y z`` - Unpacked numpy arrays compatible with the installed version of ``Corrfunc``. .. note:: If the filename is omitted, then first the fast-food file is searched for, and then the ascii file. End-users should always supply the full filename. """ if filebase is None: filename = pjoin(dirname(abspath(__file__)), "../theory/tests/data/", "gals_Mr19") allowed_exts = {'.ff': read_fastfood_catalog, '.txt': read_ascii_catalog, '.dat': read_ascii_catalog, '.csv': read_ascii_catalog, '.bin': read_fortran_catalog } for e in allowed_exts: if file_exists(filename + e): f = allowed_exts[e] x, y, z = f(filename + e, return_dtype) return x, y, z raise IOError("Could not locate {0} with any of these extensions \ = {1}".format(filename, allowed_exts.keys())) else: # Likely an user-supplied value if file_exists(filebase): extension = splitext(filebase)[1] if '.ff' in extension: f = read_fastfood_catalog elif '.bin' in extension: f = read_fortran_catalog else: f = read_ascii_catalog # default return is double x, y, z = f(filebase, return_dtype) return x, y, z raise IOError("Could not locate file {0}".format(filebase))
if __name__ == '__main__': import doctest doctest.testmod(verbose=True)