One of the most valuable tools provided by
pandas
is the ability to load data from a great variety of file formats and databases, like
SQL
(
*.db
),
Excel
(
*.xls
), and comma-separated values (
*.csv
) files.
General Usage
import pandas as pd
dataPath = 'https://raw.githubusercontent.com/alineu/pyDataScintist-Notebooks/main/data/'
df = pd.read_csv(dataPath+'BostonHousing_min.csv')
df.head()
|
crim |
zn |
indus |
chas |
nox |
rm |
age |
dis |
rad |
tax |
ptratio |
b |
lstat |
medv |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
3 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
4 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
Useful Arguments
sep
: Delimiter to use - the default value is ','
# Not specifying the delimiter when reading a tsv file is problematic
pd.read_csv(dataPath+'BostonHousing_min.tsv').head()
|
crim\t"zn"\t"indus"\t"chas"\t"nox"\t"rm"\t"age"\t"dis"\t"rad"\t"tax"\t"ptratio"\t"b"\t"lstat"\t"medv" |
0 |
0.00632\t18\t2.31\t"0"\t0.538\t6.575\t65.2\t4.... |
1 |
0.02731\t0\t7.07\t"0"\t0.469\t6.421\t78.9\t4.9... |
2 |
0.02729\t0\t7.07\t"0"\t0.469\t7.185\t61.1\t4.9... |
3 |
0.03237\t0\t2.18\t"0"\t0.458\t6.998\t45.8\t6.0... |
4 |
0.06905\t0\t2.18\t"0"\t0.458\t7.147\t54.2\t6.0... |
pd.read_csv(dataPath+'BostonHousing_min.tsv',
sep='\t').head()
|
crim |
zn |
indus |
chas |
nox |
rm |
age |
dis |
rad |
tax |
ptratio |
b |
lstat |
medv |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
3 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
4 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
Note: sep
accepts regular expressions as well e.g. sep='\s{2}\t'
header
: specifies what row to start reading the csv - the deafult value is 0
# header=0 is the default behavior
pd.read_csv(dataPath+'BostonHousing_min.csv',
header=0).head()
|
crim |
zn |
indus |
chas |
nox |
rm |
age |
dis |
rad |
tax |
ptratio |
b |
lstat |
medv |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
3 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
4 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
# header=n where n is an integer reads the file starting at the nth line
pd.read_csv(dataPath+'BostonHousing_min.csv',
header=3).head()
|
0.02729 |
0 |
7.07 |
0.1 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
0 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
1 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
2 |
0.02985 |
0.0 |
2.18 |
0 |
0.458 |
6.430 |
58.7 |
6.0622 |
3 |
222 |
18.7 |
394.12 |
5.21 |
28.7 |
3 |
0.08829 |
12.5 |
7.87 |
0 |
0.524 |
6.012 |
66.6 |
5.5605 |
5 |
311 |
15.2 |
395.60 |
12.43 |
22.9 |
4 |
0.14455 |
12.5 |
7.87 |
0 |
0.524 |
6.172 |
96.1 |
5.9505 |
5 |
311 |
15.2 |
396.90 |
19.15 |
27.1 |
Note: header=
None
treats the first line of the file as data, and the column names will be assigned as a list of integers
pd.read_csv(dataPath+'BostonHousing_min.csv',
header=None).head()
|
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
0 |
crim |
zn |
indus |
chas |
nox |
rm |
age |
dis |
rad |
tax |
ptratio |
b |
lstat |
medv |
1 |
0.00632 |
18 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.09 |
1 |
296 |
15.3 |
396.9 |
4.98 |
24 |
2 |
0.02731 |
0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.9 |
9.14 |
21.6 |
3 |
0.02729 |
0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
4 |
0.03237 |
0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
# So header=None is convenient if the file does not have a header
pd.read_csv(dataPath+'BostonHousing_no-header_min.csv',
header=None).head()
|
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
3 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
4 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
prefix
: string to be prepended to the column names. The common use is when the file has no header so the specified prefix is prepended to the integers that have automatically been assigned as column names.
pd.read_csv(dataPath+'BostonHousing_no-header_min.csv',
prefix='col_').head()
|
col_0 |
col_1 |
col_2 |
col_3 |
col_4 |
col_5 |
col_6 |
col_7 |
col_8 |
col_9 |
col_10 |
col_11 |
col_12 |
col_13 |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
3 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
4 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
skiprows
: can be used as
- a
list
to specify the line numbers to be skipped when reading a file, e.g. skiprows
=[1, 5, 8]
- an
int
that specifies where to start reading the file, e.g. skiprows
=5
. skiprows
=n
and header
=n
produce the same effect.
- a callable function which is evaluated against the row indices e.g.
skiprows
=
lambda
x: x%3==2
pd.read_csv(dataPath+'BostonHousing.csv',
skiprows=range(2,20)).head()
|
crim |
zn |
indus |
chas |
nox |
rm |
age |
dis |
rad |
tax |
ptratio |
b |
lstat |
medv |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.72580 |
0.0 |
8.14 |
0 |
0.538 |
5.727 |
69.5 |
3.7965 |
4 |
307 |
21.0 |
390.95 |
11.28 |
18.2 |
2 |
1.25179 |
0.0 |
8.14 |
0 |
0.538 |
5.570 |
98.1 |
3.7979 |
4 |
307 |
21.0 |
376.57 |
21.02 |
13.6 |
3 |
0.85204 |
0.0 |
8.14 |
0 |
0.538 |
5.965 |
89.2 |
4.0123 |
4 |
307 |
21.0 |
392.53 |
13.83 |
19.6 |
4 |
1.23247 |
0.0 |
8.14 |
0 |
0.538 |
6.142 |
91.7 |
3.9769 |
4 |
307 |
21.0 |
396.90 |
18.72 |
15.2 |
pd.read_csv(dataPath+'BostonHousing_min.csv',
skiprows=lambda x: x%3==2).head()
|
crim |
zn |
indus |
chas |
nox |
rm |
age |
dis |
rad |
tax |
ptratio |
b |
lstat |
medv |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
2 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
3 |
0.02985 |
0.0 |
2.18 |
0 |
0.458 |
6.430 |
58.7 |
6.0622 |
3 |
222 |
18.7 |
394.12 |
5.21 |
28.7 |
4 |
0.08829 |
12.5 |
7.87 |
0 |
0.524 |
6.012 |
66.6 |
5.5605 |
5 |
311 |
15.2 |
395.60 |
12.43 |
22.9 |
names
: List of column names to use. If the file contains a header row, then you should explicitly pass header
=0
to override the column names. Duplicates in this list are not allowed.
col_names = [f'col_{i}' for i in range(len(df.columns))]
col_names
['col_0',
'col_1',
'col_2',
'col_3',
'col_4',
'col_5',
'col_6',
'col_7',
'col_8',
'col_9',
'col_10',
'col_11',
'col_12',
'col_13']
pd.read_csv(dataPath+'BostonHousing_min.csv',
names=col_names,
header=0).head()
|
col_0 |
col_1 |
col_2 |
col_3 |
col_4 |
col_5 |
col_6 |
col_7 |
col_8 |
col_9 |
col_10 |
col_11 |
col_12 |
col_13 |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
3 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
4 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
Note: names
is handy if you want to override a file that has header while prefix
is often used to modify the column names of a file with no header.
usecols
: List of column names or column ids to be included in the DataFrame
.
# Using column names
pd.read_csv(dataPath+'BostonHousing_min.csv',
usecols=['age', 'tax']).head()
|
age |
tax |
0 |
65.2 |
296 |
1 |
78.9 |
242 |
2 |
61.1 |
242 |
3 |
45.8 |
222 |
4 |
54.2 |
222 |
# Using column ids
pd.read_csv(dataPath+'BostonHousing_min.csv',
usecols=[0, 5]).head()
|
crim |
rm |
0 |
0.00632 |
6.575 |
1 |
0.02731 |
6.421 |
2 |
0.02729 |
7.185 |
3 |
0.03237 |
6.998 |
4 |
0.06905 |
7.147 |
nrows
: Number of lines to read. It is useful when reading pieces of a large file.
pd.read_csv(dataPath+'BostonHousing_min.csv',
nrows=2,
usecols=[0, 5]).head()
|
crim |
rm |
0 |
0.00632 |
6.575 |
1 |
0.02731 |
6.421 |
dtype
: a dictionary
that enables the user to specify the data type. Otherwise pandas
will automatically infer the data.
df = pd.read_csv(dataPath+'BostonHousing_min.csv',
usecols=['age', 'tax'])
df.dtypes
age float64
tax int64
dtype: object
df.head()
|
age |
tax |
0 |
65.2 |
296 |
1 |
78.9 |
242 |
2 |
61.1 |
242 |
3 |
45.8 |
222 |
4 |
54.2 |
222 |
# Change the "tax" column data type from int to float
df = pd.read_csv(dataPath+'BostonHousing_min.csv',
usecols=['age', 'tax'],
dtype={'tax': float})
df.dtypes
age float64
tax float64
dtype: object
# Note the floating point in the "tax" column
df.head()
|
age |
tax |
0 |
65.2 |
296.0 |
1 |
78.9 |
242.0 |
2 |
61.1 |
242.0 |
3 |
45.8 |
222.0 |
4 |
54.2 |
222.0 |