In [2]:
import numpy as np
import pandas as pd
In [3]:
# make a 1million x 10 dataframe with nans interspersed

arr = np.random.randn(int(1e6)) 
cols = {f'column_{i}': arr for i in range(10)}
df = pd.DataFrame(cols)
df.tail()
Out[3]:
column_0 column_1 column_2 column_3 column_4 column_5 column_6 column_7 column_8 column_9
999995 1.103789 1.103789 1.103789 1.103789 1.103789 1.103789 1.103789 1.103789 1.103789 1.103789
999996 2.860796 2.860796 2.860796 2.860796 2.860796 2.860796 2.860796 2.860796 2.860796 2.860796
999997 -0.323571 -0.323571 -0.323571 -0.323571 -0.323571 -0.323571 -0.323571 -0.323571 -0.323571 -0.323571
999998 -1.371933 -1.371933 -1.371933 -1.371933 -1.371933 -1.371933 -1.371933 -1.371933 -1.371933 -1.371933
999999 0.389326 0.389326 0.389326 0.389326 0.389326 0.389326 0.389326 0.389326 0.389326 0.389326
In [4]:
%time df.to_feather('test_df.feather')
Wall time: 93.8 ms
In [5]:
%time df.to_csv('test_df.gzip.csv', compression='gzip')
Wall time: 33 s