From 7ecee3878d5d39abe918335fa96e9b40a31a156e Mon Sep 17 00:00:00 2001
From: Naoki Kanazawa
Date: Wed, 15 Nov 2023 05:13:28 +0900
Subject: [PATCH] Refactoring

- Remove context of fit model from data name and index; model_name -> name,
  model_id -> class_id
- Remove extra metadata from the scatter table
---
 .../curve_analysis/base_curve_analysis.py     |  10 +-
 .../composite_curve_analysis.py               |  60 +++---
 .../curve_analysis/curve_analysis.py          | 179 +++++++-----------
 .../curve_analysis/scatter_table.py           |  29 ++-
 ...dataframe-curve-data-a8905c450748b281.yaml |   4 +
 5 files changed, 124 insertions(+), 158 deletions(-)

diff --git a/qiskit_experiments/curve_analysis/base_curve_analysis.py b/qiskit_experiments/curve_analysis/base_curve_analysis.py
index 51fd9d29b2..e11cb63cc9 100644
--- a/qiskit_experiments/curve_analysis/base_curve_analysis.py
+++ b/qiskit_experiments/curve_analysis/base_curve_analysis.py
@@ -188,6 +188,7 @@ def _default_options(cls) -> Options:
             lmfit_options (Dict[str, Any]): Options that are passed to the
                 LMFIT minimizer. Acceptable options depend on fit_method.
             x_key (str): Circuit metadata key representing a scanned value.
+            fit_category (str): Name of the data subset in the scatter table used for the fit.
             result_parameters (List[Union[str, ParameterRepr]]): Parameters reported in the
                 database as a dedicated entry. This is a list of parameter representation
                 which is either string or ParameterRepr object. If you provide more
@@ -219,6 +220,7 @@ def _default_options(cls) -> Options:
         options.normalization = False
         options.average_method = "shots_weighted"
         options.x_key = "xval"
+        options.fit_category = "formatted"
         options.result_parameters = []
         options.extra = {}
         options.fit_method = "least_squares"
@@ -282,11 +284,13 @@ def set_options(self, **fields):
     def _run_data_processing(
         self,
         raw_data: List[Dict],
+        category: str = "raw",
     ) -> ScatterTable:
         """Perform data processing from the experiment result payload.

         Args:
             raw_data: Payload in the experiment data.
+            category: Category string of the output dataset.

         Returns:
             Processed data that will be sent to the formatter method.
@@ -296,14 +300,16 @@ def _run_data_processing(
     def _format_data(
         self,
         curve_data: ScatterTable,
+        category: str = "formatted",
     ) -> ScatterTable:
-        """Postprocessing for the processed dataset.
+        """Postprocessing to prepare the dataset for fitting.

         Args:
             curve_data: Processed dataset created from experiment results.
+            category: Category string of the output dataset.

         Returns:
-            Formatted data.
+            New scatter table instance with the formatted data appended.
         """

     @abstractmethod
diff --git a/qiskit_experiments/curve_analysis/composite_curve_analysis.py b/qiskit_experiments/curve_analysis/composite_curve_analysis.py
index 6232eda210..77dad880d8 100644
--- a/qiskit_experiments/curve_analysis/composite_curve_analysis.py
+++ b/qiskit_experiments/curve_analysis/composite_curve_analysis.py
@@ -230,32 +230,32 @@ def _create_figures(
             A list of figures.
""" for analysis in self.analyses(): - sub_data = curve_data[curve_data.model_name.str.endswith(f"_{analysis.name}")] - for model_id, data in list(sub_data.groupby("model_id")): - model_name = analysis._models[model_id]._name + sub_data = curve_data[curve_data.group == analysis.name] + for name, data in list(sub_data.groupby("name")): + full_name = f"{name}_{analysis.name}" # Plot raw data scatters if analysis.options.plot_raw_data: - raw_data = data.filter(like="processed", axis="index") + raw_data = data[data.category == "raw"] self.plotter.set_series_data( - series_name=model_name, + series_name=full_name, x=raw_data.xval.to_numpy(), y=raw_data.yval.to_numpy(), ) # Plot formatted data scatters - formatted_data = data.filter(like="formatted", axis="index") + formatted_data = data[data.category == analysis.options.fit_category] self.plotter.set_series_data( - series_name=model_name, + series_name=full_name, x_formatted=formatted_data.xval.to_numpy(), y_formatted=formatted_data.yval.to_numpy(), y_formatted_err=formatted_data.yerr.to_numpy(), ) # Plot fit lines - line_data = data.filter(like="fitted", axis="index") + line_data = data[data.category == "fitted"] if len(line_data) == 0: continue fit_stdev = line_data.yerr.to_numpy() self.plotter.set_series_data( - series_name=model_name, + series_name=full_name, x_interp=line_data.xval.to_numpy(), y_interp=line_data.yval.to_numpy(), y_interp_err=fit_stdev if np.isfinite(fit_stdev).all() else None, @@ -353,10 +353,9 @@ def _run_analysis( metadata = analysis.options.extra.copy() metadata["group"] = analysis.name - curve_data = analysis._format_data( - analysis._run_data_processing(experiment_data.data()) - ) - fit_data = analysis._run_curve_fit(curve_data.filter(like="formatted", axis="index")) + table = analysis._format_data(analysis._run_data_processing(experiment_data.data())) + formatted_subset = table[table.category == analysis.options.fit_category] + fit_data = analysis._run_curve_fit(formatted_subset) fit_dataset[analysis.name] = fit_data if fit_data.success: @@ -364,10 +363,6 @@ def _run_analysis( else: quality = "bad" - # After the quality is determined, plot can become a boolean flag for whether - # to generate the figure - plot_bool = plot == "always" or (plot == "selective" and quality == "bad") - if self.options.return_fit_parameters: # Store fit status overview entry regardless of success. # This is sometime useful when debugging the fitting code. @@ -382,10 +377,9 @@ def _run_analysis( if fit_data.success: # Add fit data to curve data table fit_curves = [] - formatted = curve_data.filter(like="formatted", axis="index") - columns = list(curve_data.columns) - for i, sub_data in list(formatted.groupby("model_id")): - name = analysis._models[i]._name + columns = list(table.columns) + model_names = analysis.model_names() + for i, sub_data in list(formatted_subset.groupby("class_id")): xval = sub_data.xval.to_numpy() if len(xval) == 0: # If data is empty, skip drawing this model. 
@@ -404,12 +398,10 @@ def _run_analysis(
                     model_fit[:, columns.index("yval")] = unp.nominal_values(yval_fit)
                     if fit_data.covar is not None:
                         model_fit[:, columns.index("yerr")] = unp.std_devs(yval_fit)
-                    model_fit[:, columns.index("model_name")] = name
-                    model_fit[:, columns.index("model_id")] = i
-                curve_data = curve_data.append_list_values(
-                    other=np.vstack(fit_curves),
-                    prefix="fitted",
-                )
+                    model_fit[:, columns.index("name")] = model_names[i]
+                    model_fit[:, columns.index("class_id")] = i
+                    model_fit[:, columns.index("category")] = "fitted"
+                table = table.append_list_values(other=np.vstack(fit_curves))
                 analysis_results.extend(
                     analysis._create_analysis_results(
                         fit_data=fit_data,
@@ -421,18 +413,20 @@ def _run_analysis(
             if self.options.return_data_points:
                 # Add raw data points
                 analysis_results.extend(
-                    analysis._create_curve_data(
-                        curve_data=curve_data.filter(like="formatted", axis="index"),
-                        **metadata,
-                    )
+                    analysis._create_curve_data(curve_data=formatted_subset, **metadata)
                 )
-            curve_data.model_name += f"_{analysis.name}"
-            curve_data_set.append(curve_data)
+            # Add extra column to identify the fit model
+            table["group"] = analysis.name
+            curve_data_set.append(table)

         combined_curve_data = pd.concat(curve_data_set)
         total_quality = self._evaluate_quality(fit_dataset)

+        # After the quality is determined, plot can become a boolean flag for whether
+        # to generate the figure
+        plot_bool = plot == "always" or (plot == "selective" and total_quality == "bad")
+
         # Create analysis results by combining all fit data
         if all(fit_data.success for fit_data in fit_dataset.values()):
             composite_results = self._create_analysis_results(
diff --git a/qiskit_experiments/curve_analysis/curve_analysis.py b/qiskit_experiments/curve_analysis/curve_analysis.py
index 7fec75b0b4..c0a16d6544 100644
--- a/qiskit_experiments/curve_analysis/curve_analysis.py
+++ b/qiskit_experiments/curve_analysis/curve_analysis.py
@@ -147,22 +147,27 @@ def models(self) -> List[lmfit.Model]:
         """Return fit models."""
         return self._models

+    def model_names(self) -> List[str]:
+        """Return model names."""
+        return [getattr(m, "_name", f"model-{i}") for i, m in enumerate(self._models)]
+
     def _run_data_processing(
         self,
         raw_data: List[Dict],
+        category: str = "raw",
     ) -> ScatterTable:
         """Perform data processing from the experiment result payload.

         Args:
             raw_data: Payload in the experiment data.
+            category: Category string of the output dataset.

         Returns:
             Processed data that will be sent to the formatter method.

         Raises:
             DataProcessorError: When key for x values is not found in the metadata.
-            DataProcessorError: When data_subfit_map information for a fit model is missing.
-            ValueError: When input data has incomplete metadata to specify fit model.
+            ValueError: When the data processor is not provided.
         """
         opt = self.options

@@ -178,64 +183,46 @@ def _run_data_processing(
                 ("xval", float),
                 ("yval", float),
                 ("yerr", float),
-                ("model_name", "U30"),  # builtin str is U0 which is zero-length unicode in numpy
-                ("model_id", int),
+                ("name", "U30"),
+                ("class_id", int),
+                ("category", "U30"),
                 ("shots", int),
             ]
         )
-        table_data = np.empty(len(to_process), dtype=dtypes)
+
+        # Build a classifier mapping circuit metadata onto fit models from the data_subfit_map option.
+        model_names = self.model_names()
+        classifier = {}
+        if len(model_names) == 1:
+            classifier[(0, model_names[0])] = {}
+        else:
+            for i, name in enumerate(model_names):
+                try:
+                    spec = self.options.data_subfit_map[name]
+                except KeyError as ex:
+                    raise DataProcessorError(
+                        f"Mapping to data for the fit model {name} is not provided."
+                    ) from ex
+                classifier[(i, name)] = spec
+
+        source = np.empty(len(to_process), dtype=dtypes)
         for idx, datum in enumerate(to_process):
-            metadata = datum["metadata"].copy()
-            # Get xval from metadata
+            metadata = datum["metadata"]
             try:
-                xval = metadata.pop(opt.x_key)
+                xval = metadata[opt.x_key]
             except KeyError as ex:
                 raise DataProcessorError(
                     f"X value key {opt.x_key} is not defined in the circuit metadata."
                 ) from ex
-            # Classify fit model
-            if len(self._models) == 1:
-                m_id = 0
-                m_name = self._models[0]._name
-            else:
-                for i, model in enumerate(self._models):
-                    try:
-                        model_spec = self.options.data_subfit_map[model._name]
-                    except KeyError as ex:
-                        raise DataProcessorError(
-                            f"Mapping to data for the fit model {model._name} is not provided."
-                        ) from ex
-                    if model_spec.items() <= metadata.items():
-                        m_id = i
-                        m_name = model._name
-                        break
-                else:
-                    raise ValueError(f"Experiment data {datum} doesn't belong to any fit model.")
-            table_data[idx]["xval"] = xval
-            table_data[idx]["shots"] = datum.get("shots", -1)
-            table_data[idx]["model_id"] = m_id
-            table_data[idx]["model_name"] = m_name
-
-            # Add extra metadata
-            add_key = metadata.keys() - table_data.dtype.fields
-            if add_key:
-                # Add missing keys to struct array
-                # This code is lengthy but faster than merge_arrays function
-                add_dtypes = []
-                for k in add_key:
-                    if isinstance(metadata[k], str):
-                        new_dtype = "U30"
-                    else:
-                        new_dtype = type(metadata[k])
-                    add_dtypes.append((k, new_dtype))
-                new_table_data = np.empty(
-                    len(to_process), dtype=sum((table_data.dtype.descr, add_dtypes), [])
-                )
-                for k in table_data.dtype.fields:
-                    new_table_data[k] = table_data[k]
-                table_data = new_table_data
-            for k, v in metadata.items():
-                table_data[idx][k] = v
+            source[idx]["xval"] = xval
+            source[idx]["shots"] = datum.get("shots", -1)
+
+            # Assign entry name and class id
+            for (class_id, name), spec in classifier.items():
+                if spec.items() <= metadata.items():
+                    source[idx]["class_id"] = class_id
+                    source[idx]["name"] = name
+                    break

         # Compute y value
         if not self.options.data_processor:
@@ -245,26 +232,25 @@ def _run_data_processing(
                 "data_processor analysis options."
             )
         processed_values = self.options.data_processor(to_process)
-        table_data["yval"] = unp.nominal_values(processed_values).flatten()
-        table_data["yerr"] = unp.std_devs(processed_values).flatten()
+        source["yval"] = unp.nominal_values(processed_values).flatten()
+        source["yerr"] = unp.std_devs(processed_values).flatten()
+        source["category"] = category

-        out = ScatterTable(
-            data=table_data,
-            index=[f"processed-{i:04d}" for i in range(len(to_process))],
-        )
-        return out
+        return ScatterTable(data=source)

     def _format_data(
         self,
         curve_data: ScatterTable,
+        category: str = "formatted",
     ) -> ScatterTable:
-        """Postprocessing for the processed dataset.
+        """Postprocessing to prepare the dataset for fitting.

         Args:
             curve_data: Processed dataset created from experiment results.
+            category: Category string of the output dataset.

         Returns:
-            New scatter table instance including raw and formatted data.
+            New scatter table instance with the formatted data appended.
""" averaging_methods = { "shots_weighted": shot_weighted_average, @@ -274,39 +260,27 @@ def _format_data( columns = list(curve_data.columns) sort_by = itemgetter( - columns.index("model_id"), + columns.index("class_id"), columns.index("xval"), ) # Use python native groupby method on ndarray. This is more performant than pandas one. average = averaging_methods[self.options.average_method] formatted = [] - for (mid, xv), g in groupby(sorted(curve_data.values, key=sort_by), key=sort_by): + for (class_id, xv), g in groupby(sorted(curve_data.values, key=sort_by), key=sort_by): g_values = np.array(list(g)) g_dict = dict(zip(columns, g_values.T)) avg_yval, avg_yerr, shots = average(g_dict["yval"], g_dict["yerr"], g_dict["shots"]) averaged = dict.fromkeys(columns) + averaged["category"] = category averaged["xval"] = xv averaged["yval"] = avg_yval averaged["yerr"] = avg_yerr - averaged["model_id"] = mid + averaged["name"] = g_dict["name"][0] + averaged["class_id"] = class_id averaged["shots"] = shots - for k, v in g_dict.items(): - if averaged[k] is not None: - continue - if len(g_values) == 1: - averaged[k] = v[0] - else: - unique = set(v) - if len(unique) == 1: - averaged[k] = next(iter(unique)) - else: - averaged[k] = list(unique) formatted.append(list(averaged.values())) - return curve_data.append_list_values( - other=formatted, - prefix="formatted", - ) + return curve_data.append_list_values(formatted) def _generate_fit_guesses( self, @@ -375,7 +349,7 @@ def _run_curve_fit( # Create convenient function to compute residual of the models. partial_residuals = [] valid_uncertainty = np.all(np.isfinite(curve_data.yerr.to_numpy())) - for i, sub_data in list(curve_data.groupby("model_id")): + for i, sub_data in list(curve_data.groupby("class_id")): if valid_uncertainty: nonzero_yerr = np.where( np.isclose(sub_data.yerr, 0.0), @@ -453,37 +427,36 @@ def _create_figures( Returns: A list of figures. 
""" - for model_id, data in list(curve_data.groupby("model_id")): - model_name = self._models[model_id]._name + for name, data in list(curve_data.groupby("name")): # Plot raw data scatters if self.options.plot_raw_data: - raw_data = data.filter(like="processed", axis="index") + raw_data = data[data.category == "raw"] self.plotter.set_series_data( - series_name=model_name, + series_name=name, x=raw_data.xval.to_numpy(), y=raw_data.yval.to_numpy(), ) # Plot formatted data scatters - formatted_data = data.filter(like="formatted", axis="index") + formatted_data = data[data.category == self.options.fit_category] self.plotter.set_series_data( - series_name=model_name, + series_name=name, x_formatted=formatted_data.xval.to_numpy(), y_formatted=formatted_data.yval.to_numpy(), y_formatted_err=formatted_data.yerr.to_numpy(), ) # Plot fit lines - line_data = data.filter(like="fitted", axis="index") + line_data = data[data.category == "fitted"] if len(line_data) == 0: continue self.plotter.set_series_data( - series_name=model_name, + series_name=name, x_interp=line_data.xval.to_numpy(), y_interp=line_data.yval.to_numpy(), ) fit_stdev = line_data.yerr.to_numpy() if np.isfinite(fit_stdev).all(): self.plotter.set_series_data( - series_name=model_name, + series_name=name, y_interp_err=fit_stdev, ) @@ -508,8 +481,9 @@ def _run_analysis( # Prepare for fitting self._initialize(experiment_data) - curve_data = self._format_data(self._run_data_processing(experiment_data.data())) - fit_data = self._run_curve_fit(curve_data.filter(like="formatted", axis="index")) + table = self._format_data(self._run_data_processing(experiment_data.data())) + formatted_subset = table[table.category == self.options.fit_category] + fit_data = self._run_curve_fit(formatted_subset) if fit_data.success: quality = self._evaluate_quality(fit_data) @@ -534,10 +508,9 @@ def _run_analysis( if fit_data.success: # Add fit data to curve data table fit_curves = [] - formatted = curve_data.filter(like="formatted", axis="index") - columns = list(curve_data.columns) - for i, sub_data in list(formatted.groupby("model_id")): - name = self._models[i]._name + columns = list(table.columns) + model_names = self.model_names() + for i, sub_data in list(formatted_subset.groupby("class_id")): xval = sub_data.xval.to_numpy() if len(xval) == 0: # If data is empty, skip drawing this model. 
@@ -556,12 +529,10 @@ def _run_analysis(
                 model_fit[:, columns.index("yval")] = unp.nominal_values(yval_fit)
                 if fit_data.covar is not None:
                     model_fit[:, columns.index("yerr")] = unp.std_devs(yval_fit)
-                model_fit[:, columns.index("model_name")] = name
-                model_fit[:, columns.index("model_id")] = i
-            curve_data = curve_data.append_list_values(
-                other=np.vstack(fit_curves),
-                prefix="fitted",
-            )
+                model_fit[:, columns.index("name")] = model_names[i]
+                model_fit[:, columns.index("class_id")] = i
+                model_fit[:, columns.index("category")] = "fitted"
+            table = table.append_list_values(other=np.vstack(fit_curves))
             analysis_results.extend(
                 self._create_analysis_results(
                     fit_data=fit_data,
@@ -572,11 +543,7 @@ def _run_analysis(

         if self.options.return_data_points:
             # Add raw data points
-            analysis_results.extend(
-                self._create_curve_data(
-                    curve_data=curve_data.filter(like="formatted", axis="index"),
-                )
-            )
+            analysis_results.extend(self._create_curve_data(curve_data=formatted_subset))

         if plot_bool:
             if fit_data.success:
@@ -584,7 +551,7 @@ def _run_analysis(
                     fit_red_chi=fit_data.reduced_chisq,
                     primary_results=[r for r in analysis_results if not r.name.startswith("@")],
                 )
-            figures.extend(self._create_figures(curve_data=curve_data))
+            figures.extend(self._create_figures(curve_data=table))

         return analysis_results, figures

diff --git a/qiskit_experiments/curve_analysis/scatter_table.py b/qiskit_experiments/curve_analysis/scatter_table.py
index 7d16cedd44..8f573226a7 100644
--- a/qiskit_experiments/curve_analysis/scatter_table.py
+++ b/qiskit_experiments/curve_analysis/scatter_table.py
@@ -33,14 +33,15 @@ class ScatterTable(pd.DataFrame, DefaultColumnsMixIn):
     for the base class API documentation.

     A single ``ScatterTable`` object can contain different kinds of intermediate data
-    generated through the curve fitting, which are categorized by the fit model.
-    When an experiment has sub-data for ``model_abc``, the formatted x, y, and y-error
+    generated through the curve fitting, which are classified by the fit model.
+    When an experiment has sub-data for ``sub_exp_1``, the formatted x, y, and y-error
     array data may be obtained from the original table object as follows:

     .. code-block:: python

-        formatted = table.filter(like="formatted", axis="index")
-        abc_data = formatted[formatted.model_name == "model_abc"]
+        abc_data = table[
+            (table.name == "sub_exp_1") & (table.category == "formatted")
+        ]
         x, y, e = abc_data.xval.to_numpy(), abc_data.yval.to_numpy(), abc_data.yerr.to_numpy()

     """

@@ -55,8 +56,9 @@ def _default_columns(cls) -> List[str]:
             "xval",
             "yval",
             "yerr",
-            "model_name",
-            "model_id",
+            "name",
+            "class_id",
+            "category",
             "shots",
         ]

@@ -77,7 +79,7 @@ def get_subset_of(self, index: Union[str, int]) -> "ScatterTable":
         """
         if isinstance(index, int):
             index = self.labels[index]
-        return self[self.model_name == index]
+        return self[self.name == index]

     @property
     @deprecate_func(
@@ -138,7 +140,7 @@ def data_allocation(self) -> np.ndarray:
         """Index of corresponding fit model."""
         # pylint: disable=no-member
-        return self.model_id.to_numpy()
+        return self.class_id.to_numpy()

     @property
     @deprecate_func(
@@ -151,29 +153,22 @@ def data_allocation(self) -> np.ndarray:
     def labels(self) -> List[str]:
         """List of model names."""
         # Order sensitive
-        name_id_tups = self.groupby(["model_name", "model_id"]).groups.keys()
+        name_id_tups = self.groupby(["name", "class_id"]).groups.keys()
         return [k[0] for k in sorted(name_id_tups, key=lambda k: k[1])]

     def append_list_values(
         self,
         other: Sequence,
-        prefix: str,
     ) -> "ScatterTable":
         """Add another list of dataframe values to this dataframe.

         Args:
             other: List of dataframe values to be added.
-            prefix: Prefix of row labels of the added values.

         Returns:
             New scatter table instance including both self and added data.
         """
-        other_index = [f"{prefix}-{i:04d}" for i in range(len(other))]
-        return ScatterTable(
-            data=[*self.values, *other],
-            columns=self.columns,
-            index=[*self.index, *other_index],
-        )
+        return ScatterTable(data=[*self.values, *other], columns=self.columns)

     def __json_encode__(self) -> Dict[str, Any]:
         return {
diff --git a/releasenotes/notes/add-dataframe-curve-data-a8905c450748b281.yaml b/releasenotes/notes/add-dataframe-curve-data-a8905c450748b281.yaml
index 7af3b7320f..912b53860c 100644
--- a/releasenotes/notes/add-dataframe-curve-data-a8905c450748b281.yaml
+++ b/releasenotes/notes/add-dataframe-curve-data-a8905c450748b281.yaml
@@ -8,6 +8,10 @@ features:
     and the table contains all data points generated by the :class:`.CurveAnalysis`.
     All properties and methods of :class:`.CurveData` are implemented for
     backward compatibility, but these will be removed in the future release.
+  - |
+    New analysis option ``fit_category`` is added to :class:`.CurveAnalysis` subclasses.
+    This option controls which data subset within the :class:`.ScatterTable`
+    is used for curve fitting.
 developer:
   - |
     :meth:`.CurveAnalysis._create_figures` method is added to the curve analysis base class.
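
Usage sketch (illustrative, not part of the patch): the snippet below shows how the renamed columns (``name``, ``class_id``) and the new ``category`` column introduced here are meant to be queried, together with the ``fit_category`` analysis option. The table rows and the model name ``model_a`` are invented for illustration; in practice the table is produced by ``_run_data_processing`` (category ``"raw"``), extended by ``_format_data`` (category ``"formatted"``), and by the fit step (category ``"fitted"``). Construction is assumed to behave like a plain ``pandas.DataFrame``, which is how ``append_list_values`` itself builds new instances in this patch.

.. code-block:: python

    from qiskit_experiments.curve_analysis import ScatterTable

    # Toy table with the new column set; all values are made up for this example.
    table = ScatterTable(
        data=[
            [0.1, 0.98, 0.02, "model_a", 0, "raw", 1024],
            [0.1, 0.96, 0.02, "model_a", 0, "raw", 1024],
            [0.1, 0.97, 0.014, "model_a", 0, "formatted", 2048],
            [0.1, 0.971, 0.001, "model_a", 0, "fitted", -1],
        ],
        columns=["xval", "yval", "yerr", "name", "class_id", "category", "shots"],
    )

    # Select the subset consumed by the fitter. With the default introduced in
    # this patch (fit_category="formatted"), this mirrors what _run_curve_fit
    # receives in CurveAnalysis._run_analysis.
    formatted = table[(table.name == "model_a") & (table.category == "formatted")]
    x = formatted.xval.to_numpy()
    y = formatted.yval.to_numpy()
    e = formatted.yerr.to_numpy()

    # A subclass whose _format_data produces an extra intermediate dataset could
    # point the fitter at that subset instead (hypothetical category name):
    # analysis.set_options(fit_category="smoothed")

Filtering on the ``category`` column replaces the previous row-index prefixes (``processed-0000``, ``formatted-0000``, ...), which is why ``append_list_values`` no longer takes a ``prefix`` argument and callers switch from ``filter(like=..., axis="index")`` to boolean masks.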