Source code for dtale.correlations
import numpy as np
import pandas as pd
import dtale.global_state as global_state
from dtale.code_export import build_code_export
from dtale.utils import classify_type, dict_merge
def get_col_groups(data_id, data):
    """
    Sort the columns of a dataset into the groups used when building correlations.

    Column metadata is read from ``global_state.get_dtypes(data_id)`` and each dtype is reduced to a
    one-letter code by ``classify_type``.

    :param data_id: identifier of the dataset whose dtype metadata should be inspected
    :param data: dataframe holding the dataset's values, used to inspect the contents of date columns
    :return: tuple of three lists:
             - names of columns classified as "I" or "F" (presumably integer/float)
             - names of columns classified as "S" whose "unique_ct" is at most 50
             - dicts of the form ``{"name": ..., "rolling": bool}`` for usable "D" (date) columns
    """
    numeric_names = []
    string_names = []
    date_entries = []
    for info in global_state.get_dtypes(data_id):
        col_name = info.get("name")
        kind = classify_type(info.get("dtype"))

        if kind in ("I", "F"):
            numeric_names.append(col_name)
            continue

        if kind == "S" and info.get("unique_ct", 0) <= 50:
            string_names.append(col_name)
            continue

        if kind != "D":
            continue

        # even if a datetime column exists, we need to make sure that there is enough data for a date
        # to warrant a correlation, https://github.com/man-group/dtale/issues/43
        per_date = data[col_name].dropna().value_counts()
        repeated_dates = (per_date > 1).sum()
        if repeated_dates > 1:
            # more than one date occurs multiple times
            date_entries.append({"name": col_name, "rolling": False})
        elif (per_date == 1).all():
            # every date occurs exactly once, so the column is flagged as "rolling"
            # NOTE(review): this is also true for a column with no non-null dates - confirm intended
            date_entries.append({"name": col_name, "rolling": True})
    return numeric_names, string_names, date_entries
def build_matrix(data_id, data, cols, code_formatting_vars=None):
    """
    Build the pearson correlation matrix for a set of columns together with a code snippet
    that reproduces the calculation.

    When any of the selected columns contains nulls the matrix is computed with
    ``pandas.DataFrame.corr``; otherwise ``numpy.corrcoef`` is used because it is faster on
    large datasets.

    :param data_id: identifier of the dataset, forwarded to ``build_code_export``
    :param data: dataframe containing (at least) the columns listed in ``cols``
    :param cols: list of column names to correlate
    :param code_formatting_vars: optional dict supplying the "corr_cols" and "str_encodings"
                                 values substituted into the exported code; both default to ""
    :return: tuple of (correlation matrix as a dataframe, exported code as a single string)
    """
    fmt_vars = dict_merge(
        {"corr_cols": "", "str_encodings": ""}, code_formatting_vars or {}
    )
    # lines shared by the exported code of both calculation strategies
    code_prefix = (
        "corr_cols = [\n"
        "\t'{corr_cols}'\n"
        "]\n"
        "corr_data = df[corr_cols]\n"
        "{str_encodings}"
    )
    selected = data[cols]

    if selected.isnull().values.any():
        # restrict the calculation to the requested columns so the result matches both the
        # numpy branch below and the exported code (which operates on df[corr_cols])
        matrix = selected.corr(method="pearson")
        code = build_code_export(data_id)
        code_body = code_prefix + "corr_data = corr_data.corr(method='pearson')"
    else:
        # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
        # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
        matrix = np.corrcoef(selected.values, rowvar=False)
        matrix = pd.DataFrame(matrix, columns=cols, index=cols)
        code = build_code_export(
            data_id, imports="import numpy as np\nimport pandas as pd\n\n"
        )
        # pass corr_cols directly (not wrapped in another list) so the exported frame gets the
        # same flat labels as the matrix built above
        code_body = code_prefix + (
            "corr_data = np.corrcoef(corr_data.values, rowvar=False)\n"
            "corr_data = pd.DataFrame(corr_data, columns=corr_cols, index=corr_cols)"
        )

    code.append(code_body.format(**fmt_vars))
    return matrix, "\n".join(code)
def get_analysis(data_id):
    """
    Rank the correlation-eligible columns of a dataset by their strongest absolute correlation.

    :param data_id: identifier of the dataset to load from ``global_state``
    :return: tuple of:
             - name of the column with the highest score
             - that column's score ("N/A" when it is null)
             - dict keyed by column (ordered by descending score) of the upper-triangle absolute
               correlations, with nulls replaced by 0
             - list of records with keys "column", "score" and "missing" (nulls replaced by "N/A")
    """
    frame = global_state.get_data(data_id)
    corr_cols = get_col_groups(data_id, frame)[0]
    matrix = build_matrix(
        data_id, frame, corr_cols, {"corr_cols": "", "str_encodings": ""}
    )[0]
    abs_matrix = matrix.abs()

    # Select upper triangle of correlation matrix (diagonal excluded)
    triangle_mask = np.triu(np.ones(abs_matrix.shape), k=1).astype(np.bool_)
    upper = abs_matrix.where(triangle_mask)

    # a column's score is its largest absolute correlation within the upper triangle
    score = upper.max(axis=1).rename("score").sort_values(ascending=False)
    ranked = upper.loc[score.index]

    column_name = ranked.index[0]
    top_score = score.loc[column_name]
    max_score = "N/A" if pd.isnull(top_score) else top_score

    upper_by_column = ranked.fillna(0).to_dict(orient="index")

    # number of null values in each correlated column
    missing = frame[corr_cols].isna().sum().rename("missing")
    combined = pd.concat([score, missing], axis=1).rename_axis("column")
    analysis = combined.fillna("N/A").reset_index().to_dict(orient="records")
    return column_name, max_score, upper_by_column, analysis