Source code for dtale.views

from __future__ import absolute_import, division

import base64
import os
import time
import webbrowser
from builtins import map, range, str, zip
from functools import wraps
from logging import getLogger

from flask import current_app, json, make_response, redirect, render_template, request

import numpy as np
import pandas as pd
import requests
import xarray as xr
from scipy import stats
from six import string_types, StringIO

import dtale.global_state as global_state
from dtale import dtale
from dtale.charts.utils import build_base_chart, build_formatters
from dtale.cli.clickutils import retrieve_meta_info_and_version
from dtale.column_builders import ColumnBuilder
from dtale.column_filters import ColumnFilter
from dtale.column_replacements import ColumnReplacement
from dtale.dash_application.charts import (
    build_raw_chart,
    chart_url_params,
    chart_url_querystring,
    export_chart,
    export_chart_data,
    url_encode_func,
)
from dtale.data_reshapers import DataReshaper
from dtale.utils import (
    DuplicateDataError,
    build_code_export,
    build_query,
    build_shutdown_url,
    classify_type,
    dict_merge,
    divide_chunks,
    export_to_csv_buffer,
    find_dtype,
    find_dtype_formatter,
    find_selected_column,
    get_bool_arg,
    get_dtypes,
    get_int_arg,
    get_json_arg,
    get_str_arg,
    grid_columns,
    grid_formatter,
    json_date,
    json_float,
    json_int,
    json_timestamp,
    jsonify,
    jsonify_error,
    make_list,
    retrieve_grid_params,
    run_query,
    running_with_flask_debug,
    running_with_pytest,
    sort_df_for_grid,
)

logger = getLogger(__name__)
IDX_COL = str("dtale_index")


def exception_decorator(func):
    @wraps(func)
    def _handle_exceptions(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except BaseException as e:
            return jsonify_error(e)

    return _handle_exceptions

def head_data_id():
    data_keys = global_state.get_data().keys()
    if not len(data_keys):
        raise Exception("No data associated with this D-Tale session")
    return sorted(data_keys)[0]

def in_ipython_frontend():
    """
    Helper function which is a variation of :meth:`pandas:pandas.io.formats.console.in_ipython_frontend`
    which checks to see if we are inside an IPython zmq frontend

    :return: `True` if D-Tale is being invoked within an IPython notebook, `False` otherwise
    """
    try:
        from IPython import get_ipython

        ip = get_ipython()
        return "zmq" in str(type(ip)).lower()
    except BaseException:
        pass
    return False

def kill(base):
    """
    This function fires a request to this instance's 'shutdown' route to kill it
    """
    requests.get(build_shutdown_url(base))

def is_up(base):
    """
    This function checks to see if an instance's :mod:`flask:flask.Flask` process is up by hitting its
    'health' route. Using `verify=False` will allow us to validate instances being served up over SSL

    :return: `True` if the :mod:`flask:flask.Flask` process is up and running, `False` otherwise
    """
    try:
        return requests.get("{}/health".format(base), verify=False).ok
    except BaseException:
        return False

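# Illustrative usage sketch (not part of the original module): the base URL below is a
# hypothetical local instance; `is_up` and `kill` are the helpers defined above.
#
# >>> base = "http://localhost:40000"
# >>> if is_up(base):      # health check over HTTP(S)
# ...     kill(base)       # fires the shutdown route for that instance
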
class DtaleData(object):
    """
    Wrapper class to abstract the global state of a D-Tale process while allowing a user to
    programmatically interact with a running D-Tale instance

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param url: endpoint for the instance's :class:`flask:flask.Flask` process
    :type url: str

    Attributes:
        _data_id            data identifier
        _url                :class:`flask:flask.Flask` endpoint
        _notebook_handle    reference to the most recent :class:`ipython:IPython.display.DisplayHandle` created

    :Example:

        >>> import dtale
        >>> import pandas as pd
        >>> df = pd.DataFrame([dict(a=1,b=2,c=3)])
        >>> d = dtale.show(df)
        >>> tmp = d.data.copy()
        >>> tmp['d'] = 4
        >>> d.data = tmp
        >>> d.kill()
    """

    def __init__(self, data_id, url):
        self._data_id = data_id
        self._url = url
        self._name_or_data_id = global_state.convert_name_to_url_path(
            (global_state.get_metadata(self._data_id) or {}).get("name")
        )
        if self._name_or_data_id is None:
            self._name_or_data_id = self._data_id
        self._main_url = self.build_main_url()
        self._notebook_handle = None

    def build_main_url(self, data_id=None):
        return "{}/dtale/main/{}".format(self._url, data_id or self._name_or_data_id)

    @property
    def data(self):
        """
        Property which is a reference to the globally stored data associated with this instance
        """
        return global_state.get_data(self._data_id)

    @data.setter
    def data(self, data):
        """
        Setter which will go through all standard formatting to make sure changes will be handled
        correctly by D-Tale
        """
        startup(self._url, data=data, data_id=self._data_id)

    def main_url(self):
        """
        Helper function creating the main :class:`flask:flask.Flask` route using this instance's url & data_id

        :return: str
        """
        if in_ipython_frontend():
            print(self._main_url)
            return None
        return self._main_url

    def kill(self):
        """
        Helper function to pass this instance's endpoint to :meth:`dtale.views.kill`
        """
        kill(self._url)

    def open_browser(self):
        """
        This function uses the :mod:`python:webbrowser` library to try and automatically open the
        server's default browser to this D-Tale instance
        """
        webbrowser.get().open(self._main_url)

    def is_up(self):
        """
        Helper function to pass this instance's endpoint to :meth:`dtale.views.is_up`
        """
        return is_up(self._url)

    def __str__(self):
        """
        Will try to create an :class:`ipython:IPython.display.IFrame` if being invoked from within an
        IPython notebook, otherwise simply returns the output of running
        :meth:`pandas:pandas.DataFrame.__str__` on the data associated with this instance
        """
        if in_ipython_frontend():
            self.notebook()
            return ""
        return self.data.__str__()

    def __repr__(self):
        """
        Will try to create an :class:`ipython:IPython.display.IFrame` if being invoked from within an
        IPython notebook, otherwise simply returns the output of running
        :meth:`pandas:pandas.DataFrame.__repr__` on the data for this instance
        """
        if in_ipython_frontend():
            self.notebook()
            if self._notebook_handle is not None:
                return ""
        return self.main_url()

    def _build_iframe(
        self, route="/dtale/iframe/", params=None, width="100%", height=475
    ):
        """
        Helper function to build an :class:`ipython:IPython.display.IFrame` if that module exists within
        your environment

        :param route: the :class:`flask:flask.Flask` route to hit on D-Tale
        :type route: str, optional
        :param params: properties & values passed as query parameters to the route
        :type params: dict, optional
        :param width: width of the ipython cell
        :type width: str or int, optional
        :param height: height of the ipython cell
        :type height: str or int, optional
        :return: :class:`ipython:IPython.display.IFrame`
        """
        try:
            from IPython.display import IFrame
        except ImportError:
            logger.info("in order to use this function, please install IPython")
            return None
        iframe_url = "{}{}{}".format(self._url, route, self._data_id)
        if params is not None:
            if isinstance(params, string_types):  # has this already been encoded?
                iframe_url = "{}?{}".format(iframe_url, params)
            else:
                iframe_url = "{}?{}".format(iframe_url, url_encode_func()(params))
        return IFrame(iframe_url, width=width, height=height)

    def notebook(self, route="/dtale/iframe/", params=None, width="100%", height=475):
        """
        Helper function which checks to see if the :mod:`flask:flask.Flask` process is up and running and
        then tries to build an :class:`ipython:IPython.display.IFrame` and run
        :meth:`ipython:IPython.display.display` on it so it will be displayed in the IPython notebook which
        invoked it.

        A reference to the :class:`ipython:IPython.display.DisplayHandle` is stored in _notebook_handle
        for updating if you are running ipython>=5.0

        :param route: the :class:`flask:flask.Flask` route to hit on D-Tale
        :type route: str, optional
        :param params: properties & values passed as query parameters to the route
        :type params: dict, optional
        :param width: width of the ipython cell
        :type width: str or int, optional
        :param height: height of the ipython cell
        :type height: str or int, optional
        """
        try:
            from IPython.display import display
        except ImportError:
            logger.info("in order to use this function, please install IPython")
            return self.data.__repr__()

        retries = 0
        while not self.is_up() and retries < 10:
            time.sleep(0.01)
            retries += 1

        self._notebook_handle = display(
            self._build_iframe(route=route, params=params, width=width, height=height),
            display_id=True,
        )
        if self._notebook_handle is None:
            self._notebook_handle = True

    def notebook_correlations(self, col1, col2, width="100%", height=475):
        """
        Helper function to build an `ipython:IPython.display.IFrame` pointing at the correlations popup

        :param col1: column on left side of correlation
        :type col1: str
        :param col2: column on right side of correlation
        :type col2: str
        :param width: width of the ipython cell
        :type width: str or int, optional
        :param height: height of the ipython cell
        :type height: str or int, optional
        :return: :class:`ipython:IPython.display.IFrame`
        """
        self.notebook(
            "/dtale/popup/correlations/",
            params=dict(col1=col1, col2=col2),
            width=width,
            height=height,
        )

    def notebook_charts(
        self,
        chart_type="line",
        query=None,
        x=None,
        y=None,
        z=None,
        group=None,
        agg=None,
        window=None,
        rolling_comp=None,
        barmode=None,
        barsort=None,
        width="100%",
        height=800,
    ):
        """
        Helper function to build an `ipython:IPython.display.IFrame` pointing at the charts popup

        :param chart_type: type of chart, possible options are line|bar|pie|scatter|3d_scatter|surface|heatmap
        :type chart_type: str
        :param query: pandas dataframe query string
        :type query: str, optional
        :param x: column to use for the X-Axis
        :type x: str
        :param y: columns to use for the Y-Axes
        :type y: list of str
        :param z: column to use for the Z-Axis
        :type z: str, optional
        :param group: column(s) to use for grouping
        :type group: list of str or str, optional
        :param agg: specific aggregation that can be applied to y or z axes. Possible values are: count,
                    first, last, mean, median, min, max, std, var, mad, prod, sum. This is included in the
                    label of the axis it is being applied to.
        :type agg: str, optional
        :param window: number of days to include in rolling aggregations
        :type window: int, optional
        :param rolling_comp: computation to use in rolling aggregations
        :type rolling_comp: str, optional
        :param barmode: mode to use for bar chart display. possible values are stack|group(default)|overlay|relative
        :type barmode: str, optional
        :param barsort: axis name to sort the bars in a bar chart by (default is 'x', but other options are
                        any of the column names used in the 'y' parameter)
        :type barsort: str, optional
        :param width: width of the ipython cell
        :type width: str or int, optional
        :param height: height of the ipython cell
        :type height: str or int, optional
        :return: :class:`ipython:IPython.display.IFrame`
        """
        params = dict(
            chart_type=chart_type,
            query=query,
            x=x,
            y=make_list(y),
            z=z,
            group=make_list(group),
            agg=agg,
            window=window,
            rolling_comp=rolling_comp,
            barmode=barmode,
            barsort=barsort,
        )
        self.notebook(
            route="/charts/",
            params=chart_url_querystring(params),
            width=width,
            height=height,
        )

    def offline_chart(
        self,
        chart_type=None,
        query=None,
        x=None,
        y=None,
        z=None,
        group=None,
        agg=None,
        window=None,
        rolling_comp=None,
        barmode=None,
        barsort=None,
        yaxis=None,
        filepath=None,
        title=None,
        **kwargs
    ):
        """
        Builds the HTML for a plotly chart figure to be saved to a file or output to a jupyter notebook

        :param chart_type: type of chart, possible options are line|bar|pie|scatter|3d_scatter|surface|heatmap
        :type chart_type: str
        :param query: pandas dataframe query string
        :type query: str, optional
        :param x: column to use for the X-Axis
        :type x: str
        :param y: columns to use for the Y-Axes
        :type y: list of str
        :param z: column to use for the Z-Axis
        :type z: str, optional
        :param group: column(s) to use for grouping
        :type group: list of str or str, optional
        :param agg: specific aggregation that can be applied to y or z axes. Possible values are: count,
                    first, last, mean, median, min, max, std, var, mad, prod, sum. This is included in the
                    label of the axis it is being applied to.
        :type agg: str, optional
        :param window: number of days to include in rolling aggregations
        :type window: int, optional
        :param rolling_comp: computation to use in rolling aggregations
        :type rolling_comp: str, optional
        :param barmode: mode to use for bar chart display. possible values are stack|group(default)|overlay|relative
        :type barmode: str, optional
        :param barsort: axis name to sort the bars in a bar chart by (default is 'x', but other options are
                        any of the column names used in the 'y' parameter)
        :type barsort: str, optional
        :param yaxis: dictionary specifying the min/max for each y-axis in your chart
        :type yaxis: dict, optional
        :param filepath: location to save HTML output
        :type filepath: str, optional
        :param title: Title of your chart
        :type title: str, optional
        :param kwargs: optional keyword arguments, here in case invalid arguments are passed to this function
        :type kwargs: dict
        :return: possible outcomes are:
                 - if run within a jupyter notebook and no 'filepath' is specified it will print the
                   resulting HTML within a cell in your notebook
                 - if 'filepath' is specified it will save the chart to the path specified
                 - otherwise it will return the HTML output as a string
        """
        params = dict(
            chart_type=chart_type,
            query=query,
            x=x,
            y=make_list(y),
            z=z,
            group=make_list(group),
            agg=agg,
            window=window,
            rolling_comp=rolling_comp,
            barmode=barmode,
            barsort=barsort,
            yaxis=yaxis,
            title=title,
        )
        params = dict_merge(params, kwargs)
        if filepath is None and in_ipython_frontend():
            from plotly.offline import iplot, init_notebook_mode

            init_notebook_mode(connected=True)
            chart = build_raw_chart(self._data_id, export=True, **params)
            # for some reason iplot does not like when the 'id' property is populated
            chart.pop("id", None)
            iplot(chart)
            return

        html_str = export_chart(self._data_id, params)
        if filepath is None:
            return html_str
        if not filepath.endswith(".html"):
            filepath = "{}.html".format(filepath)
        with open(filepath, "w") as f:
            f.write(html_str)

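# Illustrative usage sketch (not part of the original module): assumes `d` is a DtaleData
# instance returned by dtale.show() and that the underlying dataframe has 'date' and 'price'
# columns (both hypothetical). This saves a standalone plotly HTML file rather than rendering
# inline in a notebook.
#
# >>> d.offline_chart(chart_type="line", x="date", y=["price"], agg="mean",
# ...                 filepath="price_by_date")   # '.html' is appended automatically
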
    def adjust_cell_dimensions(self, width="100%", height=350):
        """
        If you are running ipython>=5.0 then this will update the most recent notebook cell you displayed
        D-Tale in for this instance with the height/width properties you have passed in as input

        :param width: width of the ipython cell
        :param height: height of the ipython cell
        """
        if self._notebook_handle is not None and hasattr(
            self._notebook_handle, "update"
        ):
            self._notebook_handle.update(self._build_iframe(width=width, height=height))
        else:
            logger.debug("You must have ipython>=5.0 installed to use this functionality")

def unique_count(s):
    return int(s.dropna().unique().size)

def get_dtype_info(data_id, col):
    dtypes = global_state.get_dtypes(data_id)
    return next((c for c in dtypes if c["name"] == col), None)

def dtype_formatter(data, dtypes, data_ranges, prev_dtypes=None):
    """
    Helper function to build a formatter for the descriptive information about each column in the dataframe
    you are viewing in D-Tale. This data is later returned to the browser to help with controlling inputs to
    functions which are heavily tied to specific data types.

    :param data: dataframe
    :type data: :class:`pandas:pandas.DataFrame`
    :param dtypes: column data type
    :type dtypes: dict
    :param data_ranges: dictionary containing minimum and maximum value for column (if applicable)
    :type data_ranges: dict, optional
    :param prev_dtypes: previous column information for syncing updates to pre-existing columns
    :type prev_dtypes: dict, optional
    :return: formatter function which takes column indexes and names
    :rtype: func
    """

    def _formatter(col_index, col):
        visible = True
        dtype = dtypes.get(col)
        if prev_dtypes and col in prev_dtypes:
            visible = prev_dtypes[col].get("visible", True)
        s = data[col]
        dtype_data = dict(
            name=col,
            dtype=dtype,
            index=col_index,
            visible=visible,
            unique_ct=unique_count(s),
            hasMissing=int(s.isnull().sum()),
            hasOutliers=0,
        )
        classification = classify_type(dtype)
        if (
            classification in ["F", "I"]
            and not s.isnull().all()
            and col in data_ranges
        ):  # floats/ints
            col_ranges = data_ranges[col]
            if not any((np.isnan(v) or np.isinf(v) for v in col_ranges.values())):
                dtype_data = dict_merge(col_ranges, dtype_data)

            # load outlier information
            o_s, o_e = calc_outlier_range(s)
            if not any((np.isnan(v) or np.isinf(v) for v in [o_s, o_e])):
                dtype_data["hasOutliers"] += int(((s < o_s) | (s > o_e)).sum())
                dtype_data["outlierRange"] = dict(lower=o_s, upper=o_e)

        if classification in ["F", "I"] and not s.isnull().all():
            # build variance flag
            unique_ct = dtype_data["unique_ct"]
            check1 = (unique_ct / len(data[col])) < 0.1
            check2 = False
            if check1 and unique_ct >= 2:
                val_counts = s.value_counts()
                check2 = (val_counts.values[0] / val_counts.values[1]) > 20
            dtype_data["lowVariance"] = bool(check1 and check2)

        if classification == "S" and not dtype_data["hasMissing"]:
            if dtype.startswith("category"):
                dtype_data["hasMissing"] += int(
                    (s.apply(lambda x: x.strip()) == "").sum()
                )
            else:
                dtype_data["hasMissing"] += int((s.str.strip() == "").sum())
        return dtype_data

    return _formatter

def build_dtypes_state(data, prev_state=None):
    """
    Helper function to build the globally managed state pertaining to a D-Tale instance's columns & data types

    :param data: dataframe to build data type information for
    :type data: :class:`pandas:pandas.DataFrame`
    :return: a list of dictionaries containing column names, indexes and data types
    """
    prev_dtypes = {c["name"]: c for c in prev_state or []}
    try:
        ranges = data.agg(["min", "max"]).to_dict()
    except ValueError:
        # I've seen when transposing data that data types get combined into one column and this exception
        # emerges when calling 'agg' on the new data
        ranges = {}
    dtype_f = dtype_formatter(data, get_dtypes(data), ranges, prev_dtypes)
    return [dtype_f(i, c) for i, c in enumerate(data.columns)]

def format_data(data, inplace=False, drop_index=False):
    """
    Helper function to build the globally managed state pertaining to a D-Tale instance's data. Some updates
    being made:
     - convert all column names to strings
     - drop any indexes back into the dataframe so what we are left with is a natural index [0,1,2,...,n]
     - convert inputs that are indexes into dataframes
     - replace any periods in column names with underscores

    :param data: dataframe to build data type information for
    :type data: :class:`pandas:pandas.DataFrame`
    :param inplace: If true, this will call `reset_index(inplace=True)` on the dataframe used as a way to
                    save memory. Otherwise this will create a brand new dataframe, thus doubling memory but
                    leaving the dataframe input unchanged.
    :type inplace: bool, optional
    :param drop_index: If true, this will drop any pre-existing index on the dataframe input.
    :type drop_index: bool, optional
    :return: formatted :class:`pandas:pandas.DataFrame` and a list of strings constituting what columns were
             originally in the index
    """
    if isinstance(data, (pd.DatetimeIndex, pd.MultiIndex)):
        data = data.to_frame(index=False)

    index = [
        str(i)
        for i in make_list(data.index.name or data.index.names)
        if i is not None
    ]
    drop = True
    if not len(index) and not data.index.equals(pd.RangeIndex(0, len(data))):
        drop = False
        index = ["index"]
    if inplace:
        data.reset_index(inplace=True, drop=drop_index)
    else:
        data = data.reset_index(drop=drop_index)
    if drop:
        if inplace:
            data.drop("index", axis=1, errors="ignore", inplace=True)
        else:
            data = data.drop("index", axis=1, errors="ignore")
    data.columns = [str(c) for c in data.columns]
    return data, index

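# Illustrative sketch of format_data (not part of the original module): a named index is pushed
# back into the columns and reported so it can be pre-locked on the front-end.
#
# >>> df = pd.DataFrame(dict(a=[1, 2], b=[3, 4])).set_index("a")
# >>> formatted, index_cols = format_data(df)
# >>> list(formatted.columns), index_cols
# (['a', 'b'], ['a'])
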
def check_duplicate_data(data):
    """
    This function will do a rough check to see if a user has already loaded this piece of data to D-Tale to
    avoid duplicated state. The checks that take place are:
     - shape (# of rows & # of columns)
     - column names and ordering of columns (eventually might add dtype checking as well...)

    :param data: dataframe to validate
    :type data: :class:`pandas:pandas.DataFrame`
    :raises :class:`dtale.utils.DuplicateDataError`: if duplicate data exists
    """
    cols = [str(col) for col in data.columns]

    for d_id, d_df in global_state.get_data().items():
        d_cols = [str(col) for col in d_df.columns]
        if d_df.shape == data.shape and cols == d_cols:
            raise DuplicateDataError(d_id)

def convert_xarray_to_dataset(dataset, **indexers):
    def _convert_zero_dim_dataset(dataset):
        ds_dict = dataset.to_dict()
        data = {}
        for coord, coord_data in ds_dict["coords"].items():
            data[coord] = coord_data["data"]
        for col, col_data in ds_dict["data_vars"].items():
            data[col] = col_data["data"]
        return pd.DataFrame([data]).set_index(list(ds_dict["coords"].keys()))

    ds_sel = dataset.sel(**indexers)
    try:
        df = ds_sel.to_dataframe()
        df = df.reset_index().drop("index", axis=1, errors="ignore")
        return df.set_index(list(dataset.dims.keys()))
    except ValueError:
        return _convert_zero_dim_dataset(ds_sel)

def handle_koalas(data):
    """
    Helper function to check if koalas is installed and if the incoming data is a koalas dataframe; if so,
    convert it to a :class:`pandas:pandas.DataFrame`, otherwise simply return the original data structure.

    :param data: data we want to check if it's a koalas dataframe and if so convert to :class:`pandas:pandas.DataFrame`
    :return: :class:`pandas:pandas.DataFrame`
    """
    if is_koalas(data):
        return data.to_pandas()
    return data

def is_koalas(data):
    try:
        from databricks.koalas import frame

        return isinstance(data, frame.DataFrame)
    except BaseException:
        return False

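# Illustrative sketch (not part of the original module): plain pandas objects pass through
# handle_koalas untouched; only koalas frames (when databricks.koalas is installed) are
# converted via .to_pandas().
#
# >>> df = pd.DataFrame(dict(a=[1, 2]))
# >>> handle_koalas(df) is df
# True
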
def startup(
    url,
    data=None,
    data_loader=None,
    name=None,
    data_id=None,
    context_vars=None,
    ignore_duplicate=False,
    allow_cell_edits=True,
    inplace=False,
    drop_index=False,
    hide_shutdown=False,
    github_fork=False,
):
    """
    Loads and stores data globally
     - If data has indexes then it will save those columns as locked on the front-end
     - If data has a column named 'index' it will be dropped so that it won't collide with row numbering (dtale_index)
     - Creates a location in memory for storing settings which can be manipulated from the front-end (sorts, filter, ...)

    :param data: :class:`pandas:pandas.DataFrame` or :class:`pandas:pandas.Series`
    :param data_loader: function which returns :class:`pandas:pandas.DataFrame`
    :param name: string label to apply to your session
    :param data_id: integer id assigned to a piece of data viewable in D-Tale, if this is populated then it
                    will override the data at that id
    :param context_vars: a dictionary of the variables that will be available for use in user-defined
                         expressions, such as filters
    :type context_vars: dict, optional
    :param ignore_duplicate: if set to True this will not test whether this data matches any previously
                             loaded to D-Tale
    :param allow_cell_edits: If false, this will not allow users to edit cells directly in their D-Tale grid
    :type allow_cell_edits: bool, optional
    :param inplace: If true, this will call `reset_index(inplace=True)` on the dataframe used as a way to
                    save memory. Otherwise this will create a brand new dataframe, thus doubling memory but
                    leaving the dataframe input unchanged.
    :type inplace: bool, optional
    :param drop_index: If true, this will drop any pre-existing index on the dataframe input.
    :type drop_index: bool, optional
    :param hide_shutdown: If true, this will hide the "Shutdown" button from users
    :type hide_shutdown: bool, optional
    :param github_fork: If true, this will display a "Fork Me On GitHub" ribbon in the upper right-hand
                        corner of the app
    :type github_fork: bool, optional
    """
    if data_loader is not None:
        data = data_loader()

    if data is not None:
        data = handle_koalas(data)
        if not isinstance(
            data, (pd.DataFrame, pd.Series, pd.DatetimeIndex, pd.MultiIndex, xr.Dataset)
        ):
            raise Exception(
                (
                    "data loaded must be one of the following types: pandas.DataFrame, pandas.Series, "
                    "pandas.DatetimeIndex, pandas.MultiIndex, xarray.Dataset"
                )
            )

        if isinstance(data, xr.Dataset):
            df = convert_xarray_to_dataset(data)
            instance = startup(
                url,
                df,
                name=name,
                data_id=data_id,
                context_vars=context_vars,
                ignore_duplicate=ignore_duplicate,
                allow_cell_edits=allow_cell_edits,
                hide_shutdown=hide_shutdown,
                github_fork=github_fork,
            )
            global_state.set_dataset(instance._data_id, data)
            global_state.set_dataset_dim(instance._data_id, {})
            return instance

        data, curr_index = format_data(data, inplace=inplace, drop_index=drop_index)
        # check to see if this dataframe has already been loaded to D-Tale
        if data_id is None and not ignore_duplicate:
            check_duplicate_data(data)

        logger.debug(
            "pytest: {}, flask-debug: {}".format(
                running_with_pytest(), running_with_flask_debug()
            )
        )

        if len(data.columns) > len(set(data.columns)):
            distinct_cols = set()
            dupes = set()
            for c in data.columns:
                if c in distinct_cols:
                    dupes.add(c)
                distinct_cols.add(c)
            raise Exception(
                "data contains duplicated column names: {}".format(
                    ", ".join(sorted(dupes))
                )
            )

        if data_id is None:
            data_id = str(
                int(max(global_state.get_data().keys())) + 1
                if len(global_state.get_data())
                else 1
            )
        if data_id in global_state.get_settings():
            curr_settings = global_state.get_settings(data_id)
            curr_locked = curr_settings.get("locked", [])
            # filter out previous locked columns that don't exist
            curr_locked = [c for c in curr_locked if c in data.columns]
            # add any new columns in index
            curr_locked += [c for c in curr_index if c not in curr_locked]
        else:
            logger.debug(
                "pre-locking index columns ({}) to settings[{}]".format(
                    curr_index, data_id
                )
            )
            curr_locked = curr_index
            global_state.set_metadata(
                data_id, dict(start=pd.Timestamp("now"), name=name)
            )

        # in the case that data has been updated we will drop any sorts or filter for ease of use
        base_settings = dict(
            locked=curr_locked,
            allow_cell_edits=allow_cell_edits,
            hide_shutdown=hide_shutdown,
            github_fork=github_fork,
        )
        global_state.set_settings(data_id, base_settings)
        global_state.set_data(data_id, data)
        global_state.set_dtypes(
            data_id, build_dtypes_state(data, global_state.get_dtypes(data_id) or [])
        )
        global_state.set_context_variables(
            data_id, build_context_variables(data_id, context_vars)
        )
        return DtaleData(data_id, url)
    else:
        raise Exception("data loaded is None!")

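# Illustrative usage sketch (not part of the original module): `startup` is normally invoked on
# your behalf by dtale.show(), but calling it directly looks roughly like this (the URL and name
# below are hypothetical).
#
# >>> df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
# >>> instance = startup("http://localhost:40000", data=df, name="demo", ignore_duplicate=True)
# >>> instance._data_id   # the returned DtaleData wraps the new data_id and the base url
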
def base_render_template(template, data_id, **kwargs):
    """
    Overridden version of Flask.render_template which will also include vital instance information
     - settings
     - version
     - processes
    """
    if not len(os.listdir("{}/static/dist".format(os.path.dirname(__file__)))):
        return redirect(current_app.url_for("missing_js"))
    curr_settings = global_state.get_settings(data_id) or {}
    _, version = retrieve_meta_info_and_version("dtale")
    return render_template(
        template,
        data_id=data_id,
        xarray=data_id in global_state.DATASETS,
        xarray_dim=json.dumps(global_state.get_dataset_dim(data_id)),
        settings=json.dumps(curr_settings),
        version=str(version),
        processes=len(global_state.get_data()),
        allow_cell_edits=global_state.load_flag(data_id, "allow_cell_edits", True),
        hide_shutdown=global_state.load_flag(data_id, "hide_shutdown", False),
        github_fork=global_state.load_flag(data_id, "github_fork", False),
        **kwargs
    )

def _view_main(data_id, iframe=False):
    """
    Helper function rendering main HTML which will also build the title and store whether we are viewing
    from an <iframe>

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param iframe: boolean flag indicating whether this is being viewed from an <iframe> (usually means ipython)
    :type iframe: bool, optional
    :return: HTML
    """
    curr_metadata = global_state.get_metadata(data_id) or {}
    title = "D-Tale"
    if curr_metadata.get("name"):
        title = "{} ({})".format(title, curr_metadata["name"])
    return base_render_template("dtale/main.html", data_id, title=title, iframe=iframe)

@dtale.route("/main")
@dtale.route("/main/<data_id>")
def view_main(data_id=None):
    """
    :class:`flask:flask.Flask` route which serves up the base jinja template housing JS files

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :return: HTML
    """
    if data_id is None or data_id not in global_state.get_data().keys():
        return redirect("/dtale/main/{}".format(head_data_id()))
    return _view_main(data_id)

@dtale.route("/iframe")
@dtale.route("/iframe/<data_id>")
def view_iframe(data_id=None):
    """
    :class:`flask:flask.Flask` route which serves up the base jinja template housing JS files

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :return: HTML
    """
    if data_id is None:
        return redirect("/dtale/iframe/{}".format(head_data_id()))
    return _view_main(data_id, iframe=True)

@dtale.route("/popup/<popup_type>")
@dtale.route("/popup/<popup_type>/<data_id>")
def view_popup(popup_type, data_id=None):
    """
    :class:`flask:flask.Flask` route which serves up a base jinja template for any popup, additionally
    forwards any request parameters as input to the template.

    :param popup_type: type of popup to be opened. Possible values: charts, correlations, describe,
                       histogram, instances
    :type popup_type: str
    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :return: HTML
    """
    if data_id is None:
        return redirect("/dtale/popup/{}/{}".format(popup_type, head_data_id()))
    curr_metadata = global_state.get_metadata(data_id) or {}
    title = "D-Tale"
    if curr_metadata.get("name"):
        title = "{} ({})".format(title, curr_metadata["name"])
    if popup_type == "reshape":
        popup_title = "Summarize Data"
    elif popup_type == "filter":
        popup_title = "Custom Filter"
    else:
        popup_title = " ".join([pt.capitalize() for pt in popup_type.split("-")])
    title = "{} - {}".format(title, popup_title)
    params = request.args.to_dict()
    if len(params):

        def pretty_print(obj):
            return ", ".join(["{}: {}".format(k, str(v)) for k, v in obj.items()])

        title = "{} ({})".format(title, pretty_print(params))
    return base_render_template(
        "dtale/popup.html",
        data_id,
        title=title,
        popup_title=popup_title,
        js_prefix=popup_type,
    )

@dtale.route("/code-popup")
def view_code_popup():
    """
    :class:`flask:flask.Flask` route which serves up a base jinja template for code snippets

    :return: HTML
    """
    return render_template("dtale/code_popup.html")

@dtale.route("/processes")
@exception_decorator
def get_processes():
    """
    :class:`flask:flask.Flask` route which returns a list of running D-Tale processes within the current
    python process

    :return: JSON {
        data: [
            {
                port: 1, name: 'name1', rows: 5, columns: 5, names: 'col1,...,col5',
                start: '2018-04-30 12:36:44', ts: 1525106204000
            },
            ...,
            {
                port: N, name: 'nameN', rows: 5, columns: 5, names: 'col1,...,col5',
                start: '2018-04-30 12:36:44', ts: 1525106204000
            }
        ],
        success: True/False
    }
    """

    def _load_process(data_id):
        data = global_state.get_data(data_id)
        dtypes = global_state.get_dtypes(data_id)
        mdata = global_state.get_metadata(data_id)
        return dict(
            data_id=data_id,
            rows=len(data),
            columns=len(dtypes),
            names=",".join([c["name"] for c in dtypes]),
            start=json_date(mdata["start"]),
            ts=json_timestamp(mdata["start"]),
            name=mdata["name"],
        )

    processes = sorted(
        [_load_process(data_id) for data_id in global_state.get_data()],
        key=lambda p: p["ts"],
    )
    return jsonify(dict(data=processes, success=True))

@dtale.route("/update-settings/<data_id>")
@exception_decorator
def update_settings(data_id):
    """
    :class:`flask:flask.Flask` route which updates global SETTINGS for the current port

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param settings: JSON string from flask.request.args['settings'] which gets decoded and stored in the
                     SETTINGS variable
    :return: JSON
    """
    curr_settings = global_state.get_settings(data_id) or {}
    updated_settings = dict_merge(curr_settings, get_json_arg(request, "settings", {}))
    global_state.set_settings(data_id, updated_settings)
    return jsonify(dict(success=True))

@dtale.route("/update-formats/<data_id>")
@exception_decorator
def update_formats(data_id):
    """
    :class:`flask:flask.Flask` route which updates the "formats" property for the global SETTINGS associated
    w/ the current port

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param all: boolean flag which, if true, tells us we should apply this formatting to all columns with
                the same data type as our selected column
    :param col: selected column
    :param format: JSON string for the formatting configuration we want applied to either the selected
                   column or all columns with the selected column's data type
    :return: JSON
    """
    update_all_dtype = get_bool_arg(request, "all")
    nan_display = get_str_arg(request, "nanDisplay")
    if nan_display is None:
        nan_display = "nan"
    col = get_str_arg(request, "col")
    col_format = get_json_arg(request, "format")
    curr_settings = global_state.get_settings(data_id) or {}
    updated_formats = {col: col_format}
    if update_all_dtype:
        col_dtype = get_dtype_info(data_id, col)["dtype"]
        updated_formats = {
            c["name"]: col_format
            for c in global_state.get_dtypes(data_id)
            if c["dtype"] == col_dtype
        }
    updated_formats = dict_merge(curr_settings.get("formats") or {}, updated_formats)
    updated_settings = dict_merge(
        curr_settings, dict(formats=updated_formats, nanDisplay=nan_display)
    )
    global_state.set_settings(data_id, updated_settings)
    return jsonify(dict(success=True))

def refresh_col_indexes(data_id):
    """
    Helper function to sync column indexes to the current state of the dataframe for data_id.
    """
    curr_dtypes = {c["name"]: c for c in global_state.get_dtypes(data_id)}
    curr_data = global_state.get_data(data_id)
    global_state.set_dtypes(
        data_id,
        [
            dict_merge(curr_dtypes[c], dict(index=idx))
            for idx, c in enumerate(curr_data.columns)
        ],
    )

@dtale.route("/update-column-position/<data_id>")
@exception_decorator
def update_column_position(data_id):
    """
    :class:`flask:flask.Flask` route to handle moving of columns within a :class:`pandas:pandas.DataFrame`.
    Columns can be moved in one of these 4 directions: front, back, left, right

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param action: string from flask.request.args['action'] of direction to move column
    :param col: string from flask.request.args['col'] of column name to move
    :return: JSON {success: True/False}
    """
    action = get_str_arg(request, "action")
    col = get_str_arg(request, "col")
    curr_cols = global_state.get_data(data_id).columns.tolist()
    if action == "front":
        curr_cols = [col] + [c for c in curr_cols if c != col]
    elif action == "back":
        curr_cols = [c for c in curr_cols if c != col] + [col]
    elif action == "left":
        if curr_cols[0] != col:
            col_idx = next((idx for idx, c in enumerate(curr_cols) if c == col), None)
            col_to_shift = curr_cols[col_idx - 1]
            curr_cols[col_idx - 1] = col
            curr_cols[col_idx] = col_to_shift
    elif action == "right":
        if curr_cols[-1] != col:
            col_idx = next((idx for idx, c in enumerate(curr_cols) if c == col), None)
            col_to_shift = curr_cols[col_idx + 1]
            curr_cols[col_idx + 1] = col
            curr_cols[col_idx] = col_to_shift

    global_state.set_data(data_id, global_state.get_data(data_id)[curr_cols])
    refresh_col_indexes(data_id)
    return jsonify(success=True)

@dtale.route("/update-locked/<data_id>")
@exception_decorator
def update_locked(data_id):
    """
    :class:`flask:flask.Flask` route to handle saving state associated with locking and unlocking columns

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param action: string from flask.request.args['action'] of action to perform (lock or unlock)
    :param col: string from flask.request.args['col'] of column name to lock/unlock
    :return: JSON {success: True/False}
    """
    action = get_str_arg(request, "action")
    col = get_str_arg(request, "col")
    curr_settings = global_state.get_settings(data_id)
    curr_data = global_state.get_data(data_id)
    if action == "lock" and col not in curr_settings["locked"]:
        curr_settings["locked"].append(col)
    elif action == "unlock":
        curr_settings["locked"] = [c for c in curr_settings["locked"] if c != col]

    final_cols = curr_settings["locked"] + [
        c for c in curr_data.columns if c not in curr_settings["locked"]
    ]
    global_state.set_data(data_id, curr_data[final_cols])
    global_state.set_settings(data_id, curr_settings)
    refresh_col_indexes(data_id)
    return jsonify(success=True)

@dtale.route("/update-visibility/<data_id>", methods=["POST"])
@exception_decorator
def update_visibility(data_id):
    """
    :class:`flask:flask.Flask` route to handle saving state associated with the visibility of columns on
    the front-end

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param visibility: string from flask.request.args['action'] of dictionary of visibility of all columns
                       in a dataframe
    :type visibility: dict, optional
    :param toggle: string from flask.request.args['col'] of column name whose visibility should be toggled
    :type toggle: str, optional
    :return: JSON {success: True/False}
    """
    curr_dtypes = global_state.get_dtypes(data_id)
    if request.form.get("visibility"):
        visibility = json.loads(request.form.get("visibility", "{}"))
        global_state.set_dtypes(
            data_id,
            [dict_merge(d, dict(visible=visibility[d["name"]])) for d in curr_dtypes],
        )
    elif request.form.get("toggle"):
        toggle_col = request.form.get("toggle")
        toggle_idx = next(
            (idx for idx, d in enumerate(curr_dtypes) if d["name"] == toggle_col), None
        )
        toggle_cfg = curr_dtypes[toggle_idx]
        curr_dtypes[toggle_idx] = dict_merge(
            toggle_cfg, dict(visible=not toggle_cfg["visible"])
        )
        global_state.set_dtypes(data_id, curr_dtypes)
    return jsonify(success=True)

@dtale.route("/build-column/<data_id>")
@exception_decorator
def build_column(data_id):
    """
    :class:`flask:flask.Flask` route to handle the building of new columns in a dataframe. Some of the
    operations that are available are:
     - numeric: sum/difference/multiply/divide any combination of two columns or static values
     - datetime: retrieving date properties (hour, minute, month, year...) or conversions of dates
       (month start, month end, quarter start...)
     - bins: bucketing numeric data into bins using :meth:`pandas:pandas.cut` & :meth:`pandas:pandas.qcut`

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param name: string from flask.request.args['name'] of new column to create
    :param type: string from flask.request.args['type'] of the type of column to build (numeric/datetime/bins)
    :param cfg: dict from flask.request.args['cfg'] of how to calculate the new column
    :return: JSON {success: True/False}
    """
    data = global_state.get_data(data_id)
    col_type = get_str_arg(request, "type")
    cfg = json.loads(get_str_arg(request, "cfg"))
    save_as = get_str_arg(request, "saveAs", "new")
    if save_as == "inplace":
        name = cfg["col"]
    else:
        name = get_str_arg(request, "name")
        if not name and col_type != "type_conversion":
            raise Exception("'name' is required for new column!")
        # non-type conversions cannot be done inplace and thus need a name and the name needs to be checked
        # that it won't overwrite something else
        name = str(name)
        data = global_state.get_data(data_id)
        if name in data.columns:
            raise Exception("A column named '{}' already exists!".format(name))

    def _build_column():
        builder = ColumnBuilder(data_id, col_type, name, cfg)
        data.loc[:, name] = builder.build_column()
        dtype = find_dtype(data[name])
        data_ranges = {}
        if classify_type(dtype) == "F" and not data[name].isnull().all():
            try:
                data_ranges[name] = data[[name]].agg(["min", "max"]).to_dict()[name]
            except ValueError:
                pass
        dtype_f = dtype_formatter(data, {name: dtype}, data_ranges)
        global_state.set_data(data_id, data)
        curr_dtypes = global_state.get_dtypes(data_id)
        new_dtype_info = dtype_f(len(curr_dtypes), name)
        if next((cdt for cdt in curr_dtypes if cdt["name"] == name), None):
            curr_dtypes = [
                new_dtype_info if cdt["name"] == name else cdt for cdt in curr_dtypes
            ]
        else:
            curr_dtypes.append(dtype_f(len(curr_dtypes), name))
        global_state.set_dtypes(data_id, curr_dtypes)
        curr_history = global_state.get_history(data_id) or []
        curr_history += make_list(builder.build_code())
        global_state.set_history(data_id, curr_history)

    if cfg.get("applyAllType", False):
        cols = [
            dtype["name"]
            for dtype in global_state.get_dtypes(data_id)
            if dtype["dtype"] == cfg["from"]
        ]
        for col in cols:
            cfg = dict_merge(cfg, dict(col=col))
            name = col
            _build_column()
    else:
        _build_column()
    return jsonify(success=True)

@dtale.route("/bins-tester/<data_id>")
@exception_decorator
def build_column_bins_tester(data_id):
    col_type = get_str_arg(request, "type")
    cfg = json.loads(get_str_arg(request, "cfg"))
    builder = ColumnBuilder(data_id, col_type, cfg["col"], cfg)
    data = global_state.get_data(data_id)
    data, labels = builder.builder.build_test(data)
    return jsonify(dict(data=data, labels=labels))

@dtale.route("/reshape/<data_id>")
@exception_decorator
def reshape_data(data_id):
    output = get_str_arg(request, "output")
    shape_type = get_str_arg(request, "type")
    cfg = json.loads(get_str_arg(request, "cfg"))
    builder = DataReshaper(data_id, shape_type, cfg)
    if output == "new":
        instance = startup("", data=builder.reshape(), ignore_duplicate=True)
    else:
        instance = startup(
            "", data=builder.reshape(), data_id=data_id, ignore_duplicate=True
        )
    curr_settings = global_state.get_settings(instance._data_id)
    global_state.set_settings(
        instance._data_id,
        dict_merge(curr_settings, dict(startup_code=builder.build_code())),
    )
    return jsonify(success=True, data_id=instance._data_id)

@dtale.route("/build-replacement/<data_id>")
@exception_decorator
def build_replacement(data_id):
    """
    :class:`flask:flask.Flask` route to handle the replacement of specific values within a column in a
    dataframe. Some of the operations that are available are:
     - spaces: replace values consisting of only spaces with a specific value
     - value: replace specific values with a specific value or aggregation
     - strings: replace values which contain a specific character or string (case-insensitive or not) with
       a specific value
     - imputer: replace nan values using sklearn imputers: iterative, knn or simple

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param col: string from flask.request.args['col'] of the column to perform replacements upon
    :param type: string from flask.request.args['type'] of the type of replacement to perform
                 (spaces/fillna/strings/imputer)
    :param cfg: dict from flask.request.args['cfg'] of how to calculate the replacements
    :return: JSON {success: True/False}
    """

    def build_data_ranges(data, col, dtype):
        data_ranges = {}
        if classify_type(dtype) == "F" and not data[col].isnull().all():
            try:
                data_ranges[col] = data[[col]].agg(["min", "max"]).to_dict()[col]
            except ValueError:
                pass
        return data_ranges

    data = global_state.get_data(data_id)
    name = get_str_arg(request, "name")
    if name is not None:
        name = str(name)
        if name in data.columns:
            raise Exception("A column named '{}' already exists!".format(name))
    col = get_str_arg(request, "col")
    replacement_type = get_str_arg(request, "type")
    cfg = json.loads(get_str_arg(request, "cfg"))

    builder = ColumnReplacement(data_id, col, replacement_type, cfg)
    output = builder.build_replacements()
    dtype = find_dtype(output)
    curr_dtypes = global_state.get_dtypes(data_id)

    if name is not None:
        data.loc[:, name] = output
        dtype_f = dtype_formatter(
            data, {name: dtype}, build_data_ranges(data, name, dtype)
        )
        curr_dtypes.append(dtype_f(len(curr_dtypes), name))
    else:
        data.loc[:, col] = output
        dtype_f = dtype_formatter(
            data, {col: dtype}, build_data_ranges(data, col, dtype)
        )
        col_index = next(
            (i for i, d in enumerate(curr_dtypes) if d["name"] == col), None
        )
        curr_col_dtype = dtype_f(col_index, col)
        curr_dtypes = [curr_col_dtype if d["name"] == col else d for d in curr_dtypes]

    global_state.set_data(data_id, data)
    global_state.set_dtypes(data_id, curr_dtypes)
    curr_history = global_state.get_history(data_id) or []
    curr_history += [builder.build_code()]
    global_state.set_history(data_id, curr_history)
    return jsonify(success=True)

@dtale.route("/test-filter/<data_id>")
@exception_decorator
def test_filter(data_id):
    """
    :class:`flask:flask.Flask` route which will test out a pandas query before it gets applied to DATA and
    return exception information to the screen if there is any

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :return: JSON {success: True/False}
    """
    query = get_str_arg(request, "query")
    run_query(
        global_state.get_data(data_id),
        build_query(data_id, query),
        global_state.get_context_variables(data_id),
    )
    if get_str_arg(request, "save"):
        curr_settings = global_state.get_settings(data_id) or {}
        if query is not None:
            curr_settings = dict_merge(curr_settings, dict(query=query))
        else:
            curr_settings = {k: v for k, v in curr_settings.items() if k != "query"}
        global_state.set_settings(data_id, curr_settings)
    return jsonify(dict(success=True))

@dtale.route("/dtypes/<data_id>")
@exception_decorator
def dtypes(data_id):
    """
    :class:`flask:flask.Flask` route which returns a list of column names and dtypes to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :return: JSON {
        dtypes: [
            {index: 1, name: col1, dtype: int64},
            ...,
            {index: N, name: colN, dtype: float64}
        ],
        success: True/False
    }
    """
    return jsonify(dtypes=global_state.get_dtypes(data_id), success=True)

def load_describe(column_series, additional_aggs=None):
    """
    Helper function for grabbing the output from :meth:`pandas:pandas.Series.describe` in a JSON
    serializable format

    :param column_series: data to describe
    :type column_series: :class:`pandas:pandas.Series`
    :return: JSON serializable dictionary of the output from calling :meth:`pandas:pandas.Series.describe`
    """
    desc = column_series.describe().to_frame().T
    code = [
        "# main statistics",
        "stats = df['{col}'].describe().to_frame().T".format(col=column_series.name),
    ]
    if additional_aggs:
        for agg in additional_aggs:
            if agg == "mode":
                mode = column_series.mode().values
                desc["mode"] = np.nan if len(mode) > 1 else mode[0]
                code.append(
                    (
                        "# mode\n"
                        "mode = df['{col}'].mode().values\n"
                        "stats['mode'] = np.nan if len(mode) > 1 else mode[0]"
                    ).format(col=column_series.name)
                )
                continue
            desc[agg] = getattr(column_series, agg)()
            code.append(
                "# {agg}\nstats['{agg}'] = df['{col}'].{agg}()".format(
                    col=column_series.name, agg=agg
                )
            )
    desc_f_overrides = {
        "I": lambda f, i, c: f.add_int(i, c, as_string=True),
        "F": lambda f, i, c: f.add_float(i, c, precision=4, as_string=True),
    }
    desc_f = grid_formatter(
        grid_columns(desc), nan_display="nan", overrides=desc_f_overrides
    )
    desc = desc_f.format_dict(next(desc.itertuples(), None))
    if "count" in desc:
        # pandas always returns 'count' as a float and it adds useless decimal points
        desc["count"] = desc["count"].split(".")[0]
    desc["total_count"] = len(column_series)
    missing_ct = column_series.isnull().sum()
    desc["missing_pct"] = json_float((missing_ct / len(column_series) * 100).round(2))
    desc["missing_ct"] = json_int(missing_ct)
    return desc, code

@dtale.route("/describe/<data_id>/<column>")
@exception_decorator
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: name of the column to be described
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }
    """
    data = global_state.get_data(data_id)[[column]]
    additional_aggs = None
    dtype = get_dtype_info(data_id, column)
    if classify_type(dtype["dtype"]) in ["I", "F"]:
        additional_aggs = ["sum", "median", "mode", "var", "sem", "skew", "kurt"]
    code = build_code_export(data_id)
    desc, desc_code = load_describe(data[column], additional_aggs=additional_aggs)
    code += desc_code
    return_data = dict(describe=desc, success=True)
    if "unique" not in return_data["describe"]:
        return_data["describe"]["unique"] = json_int(dtype["unique_ct"], as_string=True)

    uniq_vals = data[column].value_counts().sort_values(ascending=False)
    uniq_vals.index.name = "value"
    uniq_vals.name = "count"
    uniq_vals = uniq_vals.reset_index()
    code.append(
        (
            "uniq_vals = data['{}'].value_counts().sort_values(ascending=False)\n"
            "uniq_vals.index.name = 'value'\n"
            "uniq_vals.name = 'count'\n"
            "uniq_vals = uniq_vals.reset_index()"
        ).format(column)
    )

    if dtype["dtype"].startswith("mixed"):
        uniq_vals["type"] = uniq_vals["value"].apply(lambda i: type(i).__name__)
        dtype_counts = uniq_vals.groupby("type").sum().reset_index()
        dtype_counts.columns = ["dtype", "count"]
        return_data["dtype_counts"] = dtype_counts.to_dict(orient="records")
        code.append(
            (
                "uniq_vals['type'] = uniq_vals['value'].apply( lambda i: type(i).__name__)\n"
                "dtype_counts = uniq_vals.groupby('type').sum().reset_index()\n"
                "dtype_counts.columns = ['dtype', 'count']"
            )
        )
    else:
        uniq_vals.loc[:, "type"] = find_dtype(uniq_vals["value"])
        code.append(
            "uniq_vals.loc[:, 'type'] = '{}'".format(uniq_vals["type"].values[0])
        )

    return_data["uniques"] = {}
    for uniq_type, uniq_grp in uniq_vals.groupby("type"):
        total = len(uniq_grp)
        top = total > 100
        uniq_grp = uniq_grp[["value", "count"]].head(100)
        uniq_grp["value"] = uniq_grp["value"].astype(uniq_type)
        uniq_f, _ = build_formatters(uniq_grp)
        return_data["uniques"][uniq_type] = dict(
            data=uniq_f.format_dicts(uniq_grp.itertuples()),
            total=total,
            top=top,
        )

    return_data["code"] = "\n".join(code)
    return jsonify(return_data)

@dtale.route("/variance/<data_id>/<column>")
@exception_decorator
def variance(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns the information used to flag a column as "low variance"
    (unique-value ratio & ratio of the top two value counts) along with outlier/missing counts and normality
    statistics (Jarque-Bera & Shapiro-Wilk) to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: name of the column to be analyzed
    :return: JSON {
        check1: {unique, size, result}, check2: {val1, val2, result}, size, outlierCt, missingCt,
        jarqueBera: {statistic, pvalue}, shapiroWilk: {statistic, pvalue}, code, success: True/False
    }
    """
    s = global_state.get_data(data_id)[column]
    code = ["s = df['{}']".format(column)]
    unique_ct = unique_count(s)
    code.append("unique_ct = s.unique().size")
    s_size = len(s)
    code.append("s_size = len(s)")
    check1 = bool((unique_ct / s_size) < 0.1)
    code.append("check1 = (unique_ct / s_size) < 0.1")
    return_data = dict(check1=dict(unique=unique_ct, size=s_size, result=check1))
    dtype = get_dtype_info(data_id, column)
    if unique_ct >= 2:
        val_counts = s.value_counts()
        check2 = bool((val_counts.values[0] / val_counts.values[1]) > 20)
        fmt = find_dtype_formatter(dtype["dtype"])
        return_data["check2"] = dict(
            val1=dict(val=fmt(val_counts.index[0]), ct=int(val_counts.values[0])),
            val2=dict(val=fmt(val_counts.index[1]), ct=int(val_counts.values[1])),
            result=check2,
        )
    code += [
        "check2 = False",
        "if unique_ct > 1:",
        "\tval_counts = s.value_counts()",
        "\tcheck2 = (val_counts.values[0] / val_counts.values[1]) > 20",
        "low_variance = check1 and check2",
    ]
    return_data["size"] = len(s)
    return_data["outlierCt"] = dtype["hasOutliers"]
    return_data["missingCt"] = int(s.isnull().sum())

    jb_stat, jb_p = stats.jarque_bera(s)
    return_data["jarqueBera"] = dict(statistic=float(jb_stat), pvalue=float(jb_p))
    sw_stat, sw_p = stats.shapiro(s)
    return_data["shapiroWilk"] = dict(statistic=float(sw_stat), pvalue=float(sw_p))
    code += [
        "\nimport scipy.stats as stats\n",
        "jb_stat, jb_p = stats.jarque_bera(s)",
        "sw_stat, sw_p = stats.shapiro(s)",
    ]
    return_data["code"] = "\n".join(code)
    return jsonify(return_data)

def calc_outlier_range(s):
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    iqr_lower = q1 - 1.5 * iqr
    iqr_upper = q3 + 1.5 * iqr
    return iqr_lower, iqr_upper

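# Worked example of the IQR fences computed above (not part of the original module): for
# s = pd.Series([1, 2, 3, 4, 100]) -> q1 = 2.0, q3 = 4.0, iqr = 2.0, so the fences are
# (-1.0, 7.0) and only the value 100 falls outside them.
#
# >>> calc_outlier_range(pd.Series([1, 2, 3, 4, 100]))
# (-1.0, 7.0)
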
@dtale.route("/outliers/<data_id>/<column>")
@exception_decorator
def outliers(data_id, column):
    df = global_state.get_data(data_id)
    s = df[column]
    iqr_lower, iqr_upper = calc_outlier_range(s)
    formatter = find_dtype_formatter(find_dtype(df[column]))
    outliers = s[(s < iqr_lower) | (s > iqr_upper)].unique()
    if not len(outliers):
        return jsonify(outliers=[])
    top = len(outliers) > 100
    outliers = [formatter(v) for v in outliers[:100]]
    queries = []
    if iqr_lower > s.min():
        queries.append(
            "{column} < {lower}".format(column=column, lower=json_float(iqr_lower))
        )
    if iqr_upper < s.max():
        queries.append(
            "{column} > {upper}".format(column=column, upper=json_float(iqr_upper))
        )
    query = "(({}))".format(") or (".join(queries)) if len(queries) > 1 else queries[0]
    code = (
        "s = df['{column}']\n"
        "q1 = s.quantile(0.25)\n"
        "q3 = s.quantile(0.75)\n"
        "iqr = q3 - q1\n"
        "iqr_lower = q1 - 1.5 * iqr\n"
        "iqr_upper = q3 + 1.5 * iqr\n"
        "outliers = dict(s[(s < iqr_lower) | (s > iqr_upper)])"
    ).format(column=column)
    queryApplied = column in (
        (global_state.get_settings(data_id) or {}).get("outlierFilters") or {}
    )
    return jsonify(
        outliers=outliers, query=query, code=code, queryApplied=queryApplied, top=top
    )

@dtale.route("/delete-col/<data_id>/<column>")
@exception_decorator
def delete_col(data_id, column):
    data = global_state.get_data(data_id)
    data = data[[c for c in data.columns if c != column]]
    curr_history = global_state.get_history(data_id) or []
    curr_history += ["df = df[[c for c in df.columns if c != '{}']]".format(column)]
    global_state.set_history(data_id, curr_history)
    dtypes = global_state.get_dtypes(data_id)
    dtypes = [dt for dt in dtypes if dt["name"] != column]
    curr_settings = global_state.get_settings(data_id)
    curr_settings["locked"] = [
        c for c in curr_settings.get("locked", []) if c != column
    ]
    global_state.set_data(data_id, data)
    global_state.set_dtypes(data_id, dtypes)
    global_state.set_settings(data_id, curr_settings)
    return jsonify(success=True)

@dtale.route("/rename-col/<data_id>/<column>")
@exception_decorator
def rename_col(data_id, column):
    rename = get_str_arg(request, "rename")
    data = global_state.get_data(data_id)
    if column != rename and rename in data.columns:
        return jsonify(error='Column name "{}" already exists!'.format(rename))
    data = data.rename(columns={column: rename})
    curr_history = global_state.get_history(data_id) or []
    curr_history += ["df = df.rename(columns={'%s': '%s'})" % (column, rename)]
    global_state.set_history(data_id, curr_history)
    dtypes = global_state.get_dtypes(data_id)
    dtypes = [
        dict_merge(dt, {"name": rename}) if dt["name"] == column else dt
        for dt in dtypes
    ]
    curr_settings = global_state.get_settings(data_id)
    curr_settings["locked"] = [
        rename if c == column else c for c in curr_settings.get("locked", [])
    ]
    global_state.set_data(data_id, data)
    global_state.set_dtypes(data_id, dtypes)
    global_state.set_settings(data_id, curr_settings)
    return jsonify(success=True)

@dtale.route("/edit-cell/<data_id>/<column>")
@exception_decorator
def edit_cell(data_id, column):
    row_index = get_int_arg(request, "rowIndex")
    updated = get_str_arg(request, "updated")
    updated_str = updated
    curr_settings = global_state.get_settings(data_id)
    data = run_query(
        global_state.get_data(data_id),
        build_query(data_id, curr_settings.get("query")),
        global_state.get_context_variables(data_id),
        ignore_empty=True,
    )
    dtype = find_dtype(data[column])
    code = []
    if updated in ["nan", "inf"]:
        updated_str = "np.{}".format(updated)
        updated = getattr(np, updated)
        data.loc[row_index, column] = updated
        code.append(
            "df.loc[{row_index}, '{column}'] = {updated}".format(
                row_index=row_index, column=column, updated=updated_str
            )
        )
    else:
        classification = classify_type(dtype)
        if classification == "B":
            updated = updated.lower() == "true"
            updated_str = str(updated)
        elif classification == "I":
            updated = int(updated)
        elif classification == "F":
            updated = float(updated)
        elif classification == "D":
            updated_str = "pd.Timestamp({})".format(updated)
            updated = pd.Timestamp(updated)
        elif classification == "TD":
            updated_str = "pd.Timedelta({})".format(updated)
            updated = pd.Timedelta(updated)
        else:
            if dtype.startswith("category") and updated not in data[column].unique():
                data[column].cat.add_categories(updated, inplace=True)
                code.append(
                    "data['{column}'].cat.add_categories('{updated}', inplace=True)".format(
                        column=column, updated=updated
                    )
                )
            updated_str = "'{}'".format(updated)
        data.at[row_index, column] = updated
        code.append(
            "df.at[{row_index}, '{column}'] = {updated}".format(
                row_index=row_index, column=column, updated=updated_str
            )
        )
    curr_history = global_state.get_history(data_id) or []
    curr_history += code
    global_state.set_history(data_id, curr_history)

    data = global_state.get_data(data_id)
    dtypes = global_state.get_dtypes(data_id)
    ranges = {}
    try:
        ranges[column] = data[[column]].agg(["min", "max"]).to_dict()[column]
    except ValueError:
        pass
    dtype_f = dtype_formatter(data, {column: dtype}, ranges)
    dtypes = [
        dtype_f(dt["index"], column) if dt["name"] == column else dt for dt in dtypes
    ]
    global_state.set_dtypes(data_id, dtypes)
    return jsonify(success=True)

[docs]def build_filter_vals(series, data_id, column, fmt):
    dtype_info = get_dtype_info(data_id, column)
    vals = list(series.dropna().unique())
    try:
        vals = sorted(vals)
    except BaseException:
        pass  # if there are mixed values (EX: strings with ints) this fails
    if dtype_info["unique_ct"] > 500:
        # columns with too many unique values will need to use asynchronous loading,
        # so for now we'll give the first 5 values
        vals = vals[:5]
    vals = [fmt(v) for v in vals]
    return vals
[docs]@dtale.route("/column-filter-data/<data_id>/<column>") @exception_decorator def get_column_filter_data(data_id, column): s = global_state.get_data(data_id)[column] dtype = find_dtype(s) fmt = find_dtype_formatter(dtype) classification = classify_type(dtype) ret = dict(success=True, hasMissing=bool(s.isnull().any())) if classification not in ["S", "B"]: data_range = s.agg(["min", "max"]).to_dict() data_range = {k: fmt(v) for k, v in data_range.items()} ret = dict_merge(ret, data_range) if classification in ["S", "I", "B"]: ret["uniques"] = build_filter_vals(s, data_id, column, fmt) return jsonify(ret)
[docs]@dtale.route("/async-column-filter-data/<data_id>/<column>") @exception_decorator def get_async_column_filter_data(data_id, column): input = get_str_arg(request, "input") s = global_state.get_data(data_id)[column] dtype = find_dtype(s) fmt = find_dtype_formatter(dtype) vals = s[s.astype("str").str.startswith(input)] vals = [dict(value=fmt(v)) for v in sorted(vals.unique())[:5]] return jsonify(vals)
[docs]@dtale.route("/save-column-filter/<data_id>/<column>") @exception_decorator def save_column_filter(data_id, column): curr_filters = ColumnFilter( data_id, column, get_str_arg(request, "cfg") ).save_filter() return jsonify(success=True, currFilters=curr_filters)
[docs]@dtale.route("/data/<data_id>") @exception_decorator def get_data(data_id): """ :class:`flask:flask.Flask` route which returns current rows from DATA (based on scrollbar specs and saved settings) to front-end as JSON :param data_id: integer string identifier for a D-Tale process's data :type data_id: str :param ids: required dash separated string "START-END" stating a range of row indexes to be returned to the screen :param query: string from flask.request.args['query'] which is applied to DATA using the query() function :param sort: JSON string from flask.request.args['sort'] which is applied to DATA using the sort_values() or sort_index() function. Here is the JSON structure: [col1,dir1],[col2,dir2],....[coln,dirn] :return: JSON { results: [ {dtale_index: 1, col1: val1_1, ...,colN: valN_1}, ..., {dtale_index: N2, col1: val1_N2, ...,colN: valN_N2} ], columns: [{name: col1, dtype: 'int64'},...,{name: colN, dtype: 'datetime'}], total: N2, success: True/False } """ data = global_state.get_data(data_id) # this will check for when someone instantiates D-Tale programatically and directly alters the internal # state of the dataframe (EX: d.data['new_col'] = 'foo') curr_dtypes = [c["name"] for c in global_state.get_dtypes(data_id)] if any(c not in curr_dtypes for c in data.columns): data, _ = format_data(data) global_state.set_data(data_id, data) global_state.set_dtypes( data_id, build_dtypes_state(data, global_state.get_dtypes(data_id) or []) ) params = retrieve_grid_params(request) ids = get_json_arg(request, "ids") if ids is None: return jsonify({}) col_types = global_state.get_dtypes(data_id) curr_settings = global_state.get_settings(data_id) or {} f = grid_formatter(col_types, nan_display=curr_settings.get("nanDisplay", "nan")) if curr_settings.get("sort") != params.get("sort"): data = sort_df_for_grid(data, params) global_state.set_data(data_id, data) if params.get("sort") is not None: curr_settings = dict_merge(curr_settings, dict(sort=params["sort"])) else: curr_settings = {k: v for k, v in curr_settings.items() if k != "sort"} data = run_query( global_state.get_data(data_id), build_query(data_id, curr_settings.get("query")), global_state.get_context_variables(data_id), ignore_empty=True, ) global_state.set_settings(data_id, curr_settings) total = len(data) results = {} if total: for sub_range in ids: sub_range = list(map(int, sub_range.split("-"))) if len(sub_range) == 1: sub_df = data.iloc[sub_range[0] : sub_range[0] + 1] sub_df = f.format_dicts(sub_df.itertuples()) results[sub_range[0]] = dict_merge({IDX_COL: sub_range[0]}, sub_df[0]) else: [start, end] = sub_range sub_df = ( data.iloc[start:] if end >= len(data) - 1 else data.iloc[start : end + 1] ) sub_df = f.format_dicts(sub_df.itertuples()) for i, d in zip(range(start, end + 1), sub_df): results[i] = dict_merge({IDX_COL: i}, d) columns = [ dict(name=IDX_COL, dtype="int64", visible=True) ] + global_state.get_dtypes(data_id) return_data = dict(results=results, columns=columns, total=total) return jsonify(return_data)
[docs]@dtale.route("/data-export/<data_id>") @exception_decorator def data_export(data_id): curr_settings = global_state.get_settings(data_id) or {} curr_dtypes = global_state.get_dtypes(data_id) or [] data = run_query( global_state.get_data(data_id), build_query(data_id, curr_settings.get("query")), global_state.get_context_variables(data_id), ignore_empty=True, ) data = data[ [ c["name"] for c in sorted(curr_dtypes, key=lambda c: c["index"]) if c["visible"] ] ] tsv = get_str_arg(request, "tsv") == "true" file_ext = "tsv" if tsv else "csv" csv_buffer = export_to_csv_buffer(data, tsv=tsv) filename = build_chart_filename("data", ext=file_ext) return send_file(csv_buffer.getvalue(), filename, "text/{}".format(file_ext))
[docs]@dtale.route("/column-analysis/<data_id>") @exception_decorator def get_column_analysis(data_id): """ :class:`flask:flask.Flask` route which returns output from numpy.histogram/pd.value_counts to front-end as JSON :param data_id: integer string identifier for a D-Tale process's data :type data_id: str :param col: string from flask.request.args['col'] containing name of a column in your dataframe :param type: string from flask.request.args['type'] to signify either a histogram or value counts :param query: string from flask.request.args['query'] which is applied to DATA using the query() function :param bins: the number of bins to display in your histogram, options on the front-end are 5, 10, 20, 50 :param top: the number of top values to display in your value counts, default is 100 :returns: JSON {results: DATA, desc: output from pd.DataFrame[col].describe(), success: True/False} """ def handle_top(df, top): if top is not None: top = int(top) return df[:top] if top > 0 else df[top:], top elif len(df) > 100: top = 100 return df[:top], top return df, len(df) col = get_str_arg(request, "col", "values") bins = get_int_arg(request, "bins", 20) ordinal_col = get_str_arg(request, "ordinalCol") ordinal_agg = get_str_arg(request, "ordinalAgg", "sum") category_col = get_str_arg(request, "categoryCol") category_agg = get_str_arg(request, "categoryAgg", "mean") data_type = get_str_arg(request, "type") curr_settings = global_state.get_settings(data_id) or {} query = build_query(data_id, curr_settings.get("query")) data = run_query( global_state.get_data(data_id), query, global_state.get_context_variables(data_id), ) selected_col = find_selected_column(data, col) cols = [selected_col] if ordinal_col is not None: cols.append(ordinal_col) if category_col is not None: cols.append(category_col) data = data[~pd.isnull(data[selected_col])][cols] code = build_code_export( data_id, imports="import numpy as np\nimport pandas as pd\n\n" ) dtype = find_dtype(data[selected_col]) classifier = classify_type(dtype) if data_type is None: data_type = "histogram" if classifier in ["F", "I"] else "value_counts" if data_type == "value_counts": hist = pd.value_counts(data[selected_col]).to_frame(name="data").sort_index() code.append( "chart = pd.value_counts(df[~pd.isnull(df['{col}'])]['{col}'])".format( col=selected_col ) ) if ordinal_col is not None: if ordinal_agg == "pctsum": ordinal_data = data.groupby(selected_col)[[ordinal_col]].sum() ordinal_data = ordinal_data / ordinal_data.sum() code.append( ( "ordinal_data = df.groupby('{col}')[['{ordinal}']].sum()\n" "ordinal_data = ordinal_data / ordinal_data.sum()" ).format(col=selected_col, ordinal=ordinal_col) ) else: ordinal_data = getattr( data.groupby(selected_col)[[ordinal_col]], ordinal_agg )() code.append( "ordinal_data = df.groupby('{col}')[['{ordinal}']].{agg}()".format( col=selected_col, ordinal=ordinal_col, agg=ordinal_agg ) ) hist["ordinal"] = ordinal_data hist = hist.sort_values("ordinal") code.append( ( "chart['ordinal'] = ordinal_data\n" "chart = chart.sort_values('ordinal')" ) ) hist.index.name = "labels" hist = hist.reset_index() hist, top = handle_top(hist, get_int_arg(request, "top")) col_types = grid_columns(hist) f = grid_formatter(col_types, nan_display=None) return_data = f.format_lists(hist) return_data["top"] = top elif data_type == "categories": aggs = ["count", "sum" if category_agg == "pctsum" else category_agg] hist = data.groupby(category_col)[[selected_col]].agg(aggs) hist.columns = hist.columns.droplevel(0) hist.columns = ["count", 
"data"] code.append( "chart = data.groupby('{cat}')[['{col}']].agg(['{aggs}'])".format( cat=category_col, col=selected_col, aggs="', '".join(aggs) ) ) if category_agg == "pctsum": hist["data"] = hist["data"] / hist["data"].sum() code.append( "chart.loc[:, -1] = chart[chart.columns[-1]] / chart[chart.columns[-1]].sum()" ) hist.index.name = "labels" hist = hist.reset_index() hist, top = handle_top(hist, get_int_arg(request, "top")) f = grid_formatter(grid_columns(hist), nan_display=None) return_data = f.format_lists(hist) return_data["top"] = top elif data_type == "histogram": hist_data, hist_labels = np.histogram(data, bins=bins) hist_data = [json_float(h) for h in hist_data] hist_labels = [ "{0:.1f}".format(lbl) for lbl in hist_labels[1:] ] # drop the first bin because of just a minimum code.append( "chart = np.histogram(df[~pd.isnull(df['{col}'])][['{col}']], bins={bins})".format( col=selected_col, bins=bins ) ) desc, desc_code = load_describe(data[selected_col]) code += desc_code return_data = dict(labels=hist_labels, data=hist_data, desc=desc) cols = global_state.get_dtypes(data_id) return jsonify( code="\n".join(code), query=query, cols=cols, dtype=dtype, chart_type=data_type, **return_data )
[docs]@dtale.route("/correlations/<data_id>") @exception_decorator def get_correlations(data_id): """ :class:`flask:flask.Flask` route which gathers Pearson correlations against all combinations of columns with numeric data using :meth:`pandas:pandas.DataFrame.corr` On large datasets with no :attr:`numpy:numpy.nan` data this code will use :meth:`numpy:numpy.corrcoef` for speed purposes :param data_id: integer string identifier for a D-Tale process's data :type data_id: str :param query: string from flask.request.args['query'] which is applied to DATA using the query() function :returns: JSON { data: [{column: col1, col1: 1.0, col2: 0.99, colN: 0.45},...,{column: colN, col1: 0.34, col2: 0.88, colN: 1.0}], } or {error: 'Exception message', traceback: 'Exception stacktrace'} """ curr_settings = global_state.get_settings(data_id) or {} data = run_query( global_state.get_data(data_id), build_query(data_id, curr_settings.get("query")), global_state.get_context_variables(data_id), ) valid_corr_cols = [] valid_date_cols = [] for col_info in global_state.get_dtypes(data_id): name, dtype = map(col_info.get, ["name", "dtype"]) dtype = classify_type(dtype) if dtype in ["I", "F"]: valid_corr_cols.append(name) elif dtype == "D": # even if a datetime column exists, we need to make sure that there is enough data for a date # to warrant a correlation, https://github.com/man-group/dtale/issues/43 date_counts = data[name].dropna().value_counts() if len(date_counts[date_counts > 1]) > 1: valid_date_cols.append(dict(name=name, rolling=False)) elif date_counts.eq(1).all(): valid_date_cols.append(dict(name=name, rolling=True)) if data[valid_corr_cols].isnull().values.any(): data = data.corr(method="pearson") code = build_code_export(data_id) code.append("corr_data = corr_data.corr(method='pearson')") else: # using pandas.corr proved to be quite slow on large datasets so I moved to numpy: # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow data = np.corrcoef(data[valid_corr_cols].values, rowvar=False) data = pd.DataFrame(data, columns=valid_corr_cols, index=valid_corr_cols) code = build_code_export( data_id, imports="import numpy as np\nimport pandas as pd\n\n" ) code.append( ( "corr_cols = [\n" "\t'{corr_cols}'\n" "]\n" "corr_data = np.corrcoef(df[corr_cols].values, rowvar=False)\n" "corr_data = pd.DataFrame(corr_data, columns=[corr_cols], index=[corr_cols])" ).format( corr_cols="'\n\t'".join( ["', '".join(chunk) for chunk in divide_chunks(valid_corr_cols, 8)] ) ) ) code.append( "corr_data.index.name = str('column')\ncorr_data = corr_data.reset_index()" ) code = "\n".join(code) data.index.name = str("column") data = data.reset_index() col_types = grid_columns(data) f = grid_formatter(col_types, nan_display=None) return jsonify( data=f.format_dicts(data.itertuples()), dates=valid_date_cols, code=code )
[docs]@dtale.route("/chart-data/<data_id>") @exception_decorator def get_chart_data(data_id): """ :class:`flask:flask.Flask` route which builds data associated with a chart.js chart :param data_id: integer string identifier for a D-Tale process's data :type data_id: str :param query: string from flask.request.args['query'] which is applied to DATA using the query() function :param x: string from flask.request.args['x'] column to be used as x-axis of chart :param y: string from flask.request.args['y'] column to be used as y-axis of chart :param group: string from flask.request.args['group'] comma-separated string of columns to group chart data by :param agg: string from flask.request.args['agg'] points to a specific function that can be applied to :func: pandas.core.groupby.DataFrameGroupBy. Possible values are: count, first, last mean, median, min, max, std, var, mad, prod, sum :returns: JSON { data: { series1: { x: [x1, x2, ..., xN], y: [y1, y2, ..., yN] }, series2: { x: [x1, x2, ..., xN], y: [y1, y2, ..., yN] }, ..., seriesN: { x: [x1, x2, ..., xN], y: [y1, y2, ..., yN] }, }, min: minY, max: maxY, } or {error: 'Exception message', traceback: 'Exception stacktrace'} """ data = run_query( global_state.get_data(data_id), build_query(data_id, get_str_arg(request, "query")), global_state.get_context_variables(data_id), ) x = get_str_arg(request, "x") y = get_json_arg(request, "y") group_col = get_json_arg(request, "group") agg = get_str_arg(request, "agg") allow_duplicates = get_bool_arg(request, "allowDupes") window = get_int_arg(request, "rollingWin") comp = get_str_arg(request, "rollingComp") data, code = build_base_chart( data, x, y, group_col=group_col, agg=agg, allow_duplicates=allow_duplicates, rolling_win=window, rolling_comp=comp, ) data["success"] = True return jsonify(data)
[docs]@dtale.route("/correlations-ts/<data_id>") @exception_decorator def get_correlations_ts(data_id): """ :class:`flask:flask.Flask` route which returns timeseries of Pearson correlations of two columns with numeric data using :meth:`pandas:pandas.DataFrame.corr` :param data_id: integer string identifier for a D-Tale process's data :type data_id: str :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dateframe for timeseries :returns: JSON { data: {:col1:col2: {data: [{corr: 0.99, date: 'YYYY-MM-DD'},...], max: 0.99, min: 0.99} } or {error: 'Exception message', traceback: 'Exception stacktrace'} """ curr_settings = global_state.get_settings(data_id) or {} data = run_query( global_state.get_data(data_id), build_query(data_id, curr_settings.get("query")), global_state.get_context_variables(data_id), ) cols = get_str_arg(request, "cols") cols = json.loads(cols) [col1, col2] = cols date_col = get_str_arg(request, "dateCol") rolling = get_bool_arg(request, "rolling") rolling_window = get_int_arg(request, "rollingWindow") min_periods = get_int_arg(request, "minPeriods") code = build_code_export(data_id) if rolling: data = data[[date_col, col1, col2]].set_index(date_col) rolling_kwargs = {} if min_periods is not None: rolling_kwargs["min_periods"] = min_periods data = ( data[[col1, col2]] .rolling(rolling_window, **rolling_kwargs) .corr() .reset_index() ) data = data.dropna() data = data[data["level_1"] == col1][[date_col, col2]] code.append( ( "corr_ts = df[['{date_col}', '{col1}', '{col2}']].set_index('{date_col}')\n" "corr_ts = corr_ts[['{col1}', '{col2}']].rolling({rolling_window}, min_periods=min_periods).corr()\n" "corr_ts = corr_ts.reset_index().dropna()\n" "corr_ts = corr_ts[corr_ts['level_1'] == '{col1}'][['{date_col}', '{col2}']]" ).format( col1=col1, col2=col2, date_col=date_col, rolling_window=rolling_window ) ) else: data = data.groupby(date_col)[cols].corr(method="pearson") data.index.names = ["date", "column"] data = data.reset_index() data = data[data.column == col1][["date", col2]] code.append( ( "corr_ts = df.groupby('{date_col}')['{cols}'].corr(method='pearson')\n" "corr_ts.index.names = ['date', 'column']\n" "corr_ts = corr_ts[corr_ts.column == '{col1}'][['date', '{col2}']]\n" ).format(col1=col1, col2=col2, date_col=date_col, cols="', '".join(cols)) ) if rolling_window: data = data.set_index("date") rolling_kwargs = {} if min_periods is not None: rolling_kwargs["min_periods"] = min_periods data = ( data[[col2]] .rolling(rolling_window, **rolling_kwargs) .mean() .reset_index() ) data = data.dropna() code.append( ( "corr_ts = corr_ts.set_index('date')\n" "corr_ts = corr_ts[['{col}']].rolling({rolling_window}, min_periods={min_periods}).mean()\n" "corr_ts = corr_ts.reset_index().dropna()" ).format( col=col2, rolling_window=rolling_window, min_periods=min_periods ) ) data.columns = ["date", "corr"] code.append("corr_ts.columns = ['date', 'corr']") return_data, _code = build_base_chart(data.fillna(0), "date", "corr") return_data["success"] = True return_data["code"] = "\n".join(code) return jsonify(return_data)
[docs]@dtale.route("/scatter/<data_id>") @exception_decorator def get_scatter(data_id): """ :class:`flask:flask.Flask` route which returns data used in correlation of two columns for scatter chart :param data_id: integer string identifier for a D-Tale process's data :type data_id: str :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dateframe for timeseries :param date: string from flask.request.args['date'] date value in dateCol to filter dataframe to :returns: JSON { data: [{col1: 0.123, col2: 0.123, index: 1},...,{col1: 0.123, col2: 0.123, index: N}], stats: { stats: { correlated: 50, only_in_s0: 1, only_in_s1: 2, pearson: 0.987, spearman: 0.879, } x: col1, y: col2 } or {error: 'Exception message', traceback: 'Exception stacktrace'} """ cols = get_json_arg(request, "cols") date = get_str_arg(request, "date") date_col = get_str_arg(request, "dateCol") rolling = get_bool_arg(request, "rolling") curr_settings = global_state.get_settings(data_id) or {} data = run_query( global_state.get_data(data_id), build_query(data_id, curr_settings.get("query")), global_state.get_context_variables(data_id), ) idx_col = str("index") y_cols = [cols[1], idx_col] code = build_code_export(data_id) if rolling: window = get_int_arg(request, "window") idx = min(data[data[date_col] == date].index) + 1 data = data.iloc[max(idx - window, 0) : idx] data = data[cols + [date_col]].dropna(how="any") y_cols.append(date_col) code.append( ( "idx = min(df[df['{date_col}'] == '{date}'].index) + 1\n" "scatter_data = scatter_data.iloc[max(idx - {window}, 0):idx]\n" "scatter_data = scatter_data['{cols}'].dropna(how='any')" ).format( date_col=date_col, date=date, window=window, cols="', '".join(sorted(list(set(cols)) + [date_col])), ) ) else: data = data[data[date_col] == date] if date else data data = data[cols].dropna(how="any") code.append( ( "scatter_data = df[df['{date_col}'] == '{date}']" if date else "scatter_data = df" ).format(date_col=date_col, date=date) ) code.append( "scatter_data = scatter_data['{cols}'].dropna(how='any')".format( cols="', '".join(cols) ) ) data[idx_col] = data.index [col1, col2] = cols s0 = data[col1] s1 = data[col2] pearson = s0.corr(s1, method="pearson") spearman = s0.corr(s1, method="spearman") stats = dict( pearson="N/A" if pd.isnull(pearson) else pearson, spearman="N/A" if pd.isnull(spearman) else spearman, correlated=len(data), only_in_s0=len(data[data[col1].isnull()]), only_in_s1=len(data[data[col2].isnull()]), ) code.append( ( "scatter_data['{idx_col}'] = scatter_data.index\n" "s0 = scatter_data['{col1}']\n" "s1 = scatter_data['{col2}']\n" "pearson = s0.corr(s1, method='pearson')\n" "spearman = s0.corr(s1, method='spearman')\n" "only_in_s0 = len(scatter_data[scatter_data['{col1}'].isnull()])\n" "only_in_s1 = len(scatter_data[scatter_data['{col2}'].isnull()])" ).format(col1=col1, col2=col2, idx_col=idx_col) ) if len(data) > 15000: return jsonify( stats=stats, code="\n".join(code), error="Dataset exceeds 15,000 records, cannot render scatter. Please apply filter...", ) data, _code = build_base_chart(data, cols[0], y_cols, allow_duplicates=True) data["x"] = cols[0] data["y"] = cols[1] data["stats"] = stats data["code"] = "\n".join(code) return jsonify(data)
[docs]def build_context_variables(data_id, new_context_vars=None):
    """
    Build and return the dictionary of context variables associated with a process.
    If the names of any new variables are not formatted properly, an exception will be raised.
    New variables will overwrite the values of existing variables if they share the same name.

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param new_context_vars: dictionary of name, value pairs for new context variables
    :type new_context_vars: dict, optional
    :returns: dict of the context variables for this process
    :rtype: dict
    """
    if new_context_vars:
        for name, value in new_context_vars.items():
            if not isinstance(name, string_types):
                raise SyntaxError(
                    "{}, context variables must be a valid string".format(name)
                )
            elif not name.replace("_", "").isalnum():
                raise SyntaxError(
                    "{}, context variables can only contain letters, digits, or underscores".format(
                        name
                    )
                )
            elif name.startswith("_"):
                raise SyntaxError(
                    "{}, context variables can not start with an underscore".format(
                        name
                    )
                )
    return dict_merge(global_state.get_context_variables(data_id), new_context_vars)
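The naming rules enforced above, illustrated on plain strings (not part of dtale.views): names must be strings made up of letters, digits or underscores and must not begin with an underscore.

>>> name = "rate_threshold"
>>> name.replace("_", "").isalnum() and not name.startswith("_")
True
>>> "bad-name".replace("_", "").isalnum()
False
>>> "_hidden".startswith("_")
True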
[docs]@dtale.route("/filter-info/<data_id>") @exception_decorator def get_filter_info(data_id): """ :class:`flask:flask.Flask` route which returns a view-only version of the query, column filters & context variables to the front end. :param data_id: integer string identifier for a D-Tale process's data :type data_id: str :return: JSON """ def value_as_str(value): """Convert values into a string representation that can be shown to the user in the front-end.""" return str(value)[:1000] ctxt_vars = global_state.get_context_variables(data_id) or {} ctxt_vars = [dict(name=k, value=value_as_str(v)) for k, v in ctxt_vars.items()] curr_settings = global_state.get_settings(data_id) or {} curr_settings = { k: v for k, v in curr_settings.items() if k in ["query", "columnFilters", "outlierFilters"] } return jsonify(contextVars=ctxt_vars, success=True, **curr_settings)
[docs]@dtale.route("/xarray-coordinates/<data_id>") @exception_decorator def get_xarray_coords(data_id): ds = global_state.get_dataset(data_id) coord_data = [ dict(name=coord, count=len(info), dtype=info.dtype.name) for coord, info in ds.coords.items() ] return jsonify(data=coord_data)
[docs]@dtale.route("/xarray-dimension-values/<data_id>/<dim>") @exception_decorator def get_xarray_dimension_values(data_id, dim): ds = global_state.get_dataset(data_id) dim = pd.DataFrame({"value": ds.coords[dim].data}) dim_f, _ = build_formatters(dim) return jsonify(data=dim_f.format_dicts(dim.itertuples()))
[docs]@dtale.route("/update-xarray-selection/<data_id>") @exception_decorator def update_xarray_selection(data_id): ds = global_state.get_dataset(data_id) selection = get_json_arg(request, "selection") or {} df = convert_xarray_to_dataset(ds, **selection) startup("", data=df, data_id=data_id, ignore_duplicate=True) global_state.set_dataset_dim(data_id, selection) return jsonify(success=True)
[docs]@dtale.route("/to-xarray/<data_id>") @exception_decorator def to_xarray(data_id): df = global_state.get_data(data_id) index_cols = get_json_arg(request, "index") ds = df.set_index(index_cols).to_xarray() startup("", data=ds, data_id=data_id, ignore_duplicate=True) curr_settings = global_state.get_settings(data_id) startup_code = "df = df.set_index(['{index}']).to_xarray()".format( index="', '".join(index_cols) ) global_state.set_settings( data_id, dict_merge(curr_settings, dict(startup_code=startup_code)) ) return jsonify(success=True)
[docs]@dtale.route("/code-export/<data_id>") @exception_decorator def get_code_export(data_id): code = build_code_export(data_id) return jsonify(code="\n".join(code), success=True)
[docs]def build_chart_filename(chart_type, ext="html"):
    return "{}_export_{}.{}".format(
        chart_type, json_timestamp(pd.Timestamp("now")), ext
    )
[docs]def send_file(output, filename, content_type):
    resp = make_response(output)
    resp.headers["Content-Disposition"] = "attachment; filename=%s" % filename
    resp.headers["Content-Type"] = content_type
    return resp
[docs]@dtale.route("/chart-export/<data_id>") @exception_decorator def chart_export(data_id): params = chart_url_params(request.args.to_dict()) html_str = export_chart(data_id, params) filename = build_chart_filename(params["chart_type"]) return send_file(html_str, filename, "text/html")
[docs]@dtale.route("/chart-csv-export/<data_id>") @exception_decorator def chart_csv_export(data_id): params = chart_url_params(request.args.to_dict()) csv_buffer = export_chart_data(data_id, params) filename = build_chart_filename(params["chart_type"], ext="csv") return send_file(csv_buffer.getvalue(), filename, "text/csv")
[docs]@dtale.route("/cleanup/<data_id>") @exception_decorator def run_cleanup(data_id): global_state.cleanup(data_id) return jsonify(success=True)
[docs]@dtale.route("/upload", methods=["POST"]) @exception_decorator def upload(): contents, filename = (request.form.get(p) for p in ["contents", "filename"]) if contents is None: raise Exception("No file data loaded!") _, content_string = contents.split(",") decoded = base64.b64decode(content_string) str_obj = StringIO(decoded.decode("utf-8")) df = pd.read_csv(str_obj) # TODO: handle un-named index columns... instance = startup("", data=df, ignore_duplicate=True) curr_settings = global_state.get_settings(instance._data_id) global_state.set_settings( instance._data_id, dict_merge( curr_settings, dict(startup_code="df = pd.read_csv('{}')".format(filename)) ), ) return jsonify(success=True, data_id=instance._data_id)