python/pyspark/pandas/plot/plotly.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import inspect
from typing import TYPE_CHECKING, Union

import pandas as pd

from pyspark.pandas.plot import (
    HistogramPlotBase,
    name_like_string,
    PandasOnSparkPlotAccessor,
    BoxPlotBase,
    KdePlotBase,
)

if TYPE_CHECKING:
    import pyspark.pandas as ps


def plot_pandas_on_spark(data: Union["ps.DataFrame", "ps.Series"], kind: str, **kwargs):
    import plotly

    # pandas-on-Spark specific plots
    if kind == "pie":
        return plot_pie(data, **kwargs)
    if kind == "hist":
        return plot_histogram(data, **kwargs)
    if kind == "box":
        return plot_box(data, **kwargs)
    if kind == "kde" or kind == "density":
        return plot_kde(data, **kwargs)

    # Other plots.
    return plotly.plot(PandasOnSparkPlotAccessor.pandas_plot_data_map[kind](data), kind, **kwargs)


def plot_pie(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    from plotly import express

    data = PandasOnSparkPlotAccessor.pandas_plot_data_map["pie"](data)

    if isinstance(data, pd.Series):
        pdf = data.to_frame()
        return express.pie(pdf, values=pdf.columns[0], names=pdf.index, **kwargs)
    elif isinstance(data, pd.DataFrame):
        values = kwargs.pop("y", None)
        default_names = None
        if values is not None:
            default_names = data.index

        return express.pie(
            data,
            values=kwargs.pop("values", values),
            names=kwargs.pop("names", default_names),
            **kwargs,
        )
    else:
        raise RuntimeError("Unexpected type: [%s]" % type(data))


def plot_histogram(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    import plotly.graph_objs as go
    import pyspark.pandas as ps

    bins = kwargs.get("bins", 10)
    y = kwargs.get("y")
    if y and isinstance(data, ps.DataFrame):
        # Note that the results here are matched with matplotlib. x and y
        # handling is different from pandas' plotly output.
        data = data[y]
    psdf, bins = HistogramPlotBase.prepare_hist_data(data, bins)
    assert len(bins) > 2, "the number of buckets must be higher than 2."
    output_series = HistogramPlotBase.compute_hist(psdf, bins)
    prev = float("%.9f" % bins[0])  # to make it prettier, truncate.
    text_bins = []
    for b in bins[1:]:
        norm_b = float("%.9f" % b)
        text_bins.append("[%s, %s)" % (prev, norm_b))
        prev = norm_b
    text_bins[-1] = text_bins[-1][:-1] + "]"  # replace ) to ] for the last bucket.

    bins = 0.5 * (bins[:-1] + bins[1:])

    output_series = list(output_series)
    bars = []
    for series in output_series:
        bars.append(
            go.Bar(
                x=bins,
                y=series,
                name=name_like_string(series.name),
                text=text_bins,
                hovertemplate=(
                    "variable=" + name_like_string(series.name) + "<br>value=%{text}<br>count=%{y}"
                ),
            )
        )

    layout_keys = inspect.signature(go.Layout).parameters.keys()
    layout_kwargs = {k: v for k, v in kwargs.items() if k in layout_keys}

    fig = go.Figure(data=bars, layout=go.Layout(**layout_kwargs))
    fig["layout"]["barmode"] = "stack"
    fig["layout"]["xaxis"]["title"] = "value"
    fig["layout"]["yaxis"]["title"] = "count"
    return fig


def plot_box(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    import plotly.graph_objs as go
    import pyspark.pandas as ps
    from pyspark.sql.types import NumericType

    # 'whis' isn't actually an argument in plotly (but in matplotlib). But seems like
    # plotly doesn't expose the reach of the whiskers to the beyond the first and
    # third quartiles (?). Looks they use default 1.5.
    whis = kwargs.pop("whis", 1.5)
    # 'precision' is pandas-on-Spark specific to control precision for approx_percentile
    precision = kwargs.pop("precision", 0.01)

    # Plotly options
    boxpoints = kwargs.pop("boxpoints", "suspectedoutliers")
    notched = kwargs.pop("notched", False)
    if boxpoints not in ["suspectedoutliers", False]:
        raise ValueError(
            "plotly plotting backend does not support 'boxpoints' set to '%s'. "
            "Set to 'suspectedoutliers' or False." % boxpoints
        )
    if notched:
        raise ValueError(
            "plotly plotting backend does not support 'notched' set to '%s'. "
            "Set to False." % notched
        )

    fig = go.Figure()

    if isinstance(data, ps.Series):
        sdf = data._psdf._internal.resolved_copy.spark_frame
        spark_column_name = data._internal.spark_column_name_for(data._column_label)
        colnames = [spark_column_name]
    else:
        sdf = data._internal.resolved_copy.spark_frame
        colnames = []
        for column_label in data._internal.column_labels:
            if isinstance(data._internal.spark_type_for(column_label), NumericType):
                colnames.append(name_like_string(column_label))

    results = BoxPlotBase.compute_box(
        sdf,
        colnames,
        whis,
        precision,
        boxpoints is not None,
    )
    assert len(results) == len(colnames)

    if isinstance(data, ps.Series):
        colname = name_like_string(data.name)
        result = results[0]

        fig.add_trace(
            go.Box(
                name=colname,
                q1=[result["q1"]],
                median=[result["med"]],
                q3=[result["q3"]],
                mean=[result["mean"]],
                lowerfence=[result["lower_whisker"]],
                upperfence=[result["upper_whisker"]],
                y=[result["fliers"]] if result["fliers"] else None,
                boxpoints=boxpoints,
                notched=notched,
                **kwargs,  # this is for workarounds. Box takes different options from express.box.
            )
        )
        fig["layout"]["xaxis"]["title"] = colname

    else:
        for i, colname in enumerate(colnames):
            result = results[i]

            fig.add_trace(
                go.Box(
                    x=[i],
                    name=colname,
                    q1=[result["q1"]],
                    median=[result["med"]],
                    q3=[result["q3"]],
                    mean=[result["mean"]],
                    lowerfence=[result["lower_whisker"]],
                    upperfence=[result["upper_whisker"]],
                    y=[result["fliers"]] if result["fliers"] else None,
                    boxpoints=boxpoints,
                    notched=notched,
                    **kwargs,
                )
            )

    fig["layout"]["yaxis"]["title"] = "value"
    return fig


def plot_kde(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    from plotly import express
    import pyspark.pandas as ps

    if isinstance(data, ps.DataFrame) and "color" not in kwargs:
        kwargs["color"] = "names"

    psdf = KdePlotBase.prepare_kde_data(data)
    sdf = psdf._internal.spark_frame
    data_columns = psdf._internal.data_spark_columns
    ind = KdePlotBase.get_ind(sdf.select(*data_columns), kwargs.pop("ind", None))
    bw_method = kwargs.pop("bw_method", None)

    kde_cols = [
        KdePlotBase.compute_kde_col(
            input_col=psdf._internal.spark_column_for(label),
            ind=ind,
            bw_method=bw_method,
        ).alias(f"kde_{i}")
        for i, label in enumerate(psdf._internal.column_labels)
    ]
    kde_results = sdf.select(*kde_cols).first()

    pdf = pd.concat(
        [
            pd.DataFrame(
                {
                    "Density": kde_result,
                    "names": name_like_string(label),
                    "index": ind,
                }
            )
            for label, kde_result in zip(psdf._internal.column_labels, list(kde_results))
        ]
    )

    fig = express.line(pdf, x="index", y="Density", **kwargs)
    fig["layout"]["xaxis"]["title"] = None
    return fig