cmflib.cmfquery.CmfQuery¶

Bases: object

CMF Query communicates with the MLMD database and implements basic search and retrieval functionality.

This class has been designed to work with the CMF framework. CMF alters names of pipelines, stages and artifacts in various ways. This means that actual names in the MLMD database will be different from those originally provided by users via CMF API. When methods in this class accept name parameters, it is expected that values of these parameters are fully-qualified names of respective entities.

Parameters:

Name	Type	Description	Default
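`filepath`	`str`	Path to the MLMD database file.	`'mlmd'`
`is_server`	`bool`	If true, connect through `PostgresStore` (configured via `get_postgres_config()`) instead of the SQLite file at `filepath`.	`False`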

Source code in cmflib/cmfquery.py

def __init__(self, filepath: str = "mlmd", is_server=False) -> None:
    """Connect to the MLMD store that backs this query object.

    Args:
        filepath: Path to the MLMD database file. It is kept on the instance and used as the
            SQLite file name when `is_server` is false.
        is_server: When true, connect through `PostgresStore` using the configuration returned
            by `get_postgres_config()`; otherwise connect through `SqlliteStore`.
    """
    self.filepath = filepath
    backend: t.Union[PostgresStore, SqlliteStore] = (
        PostgresStore(get_postgres_config()) if is_server else SqlliteStore({"filename": filepath})
    )
    # The connected handle is what the query methods reach through `self.store`.
    self.store = backend.connect()

`get_pipeline_names()` ¶

Return names of all pipelines.

Returns:

Type	Description
`List[str]`	List of all pipeline names.

Source code in cmflib/cmfquery.py

def get_pipeline_names(self) -> t.List[str]:
    """Collect the names of every pipeline known to the store.

    Returns:
        List of all pipeline names, in the order the pipelines are returned by `_get_pipelines`.
    """
    names: t.List[str] = []
    for pipeline in self._get_pipelines():
        names.append(pipeline.name)
    return names

`get_pipeline_id(pipeline_name)` ¶

Return pipeline identifier for the pipeline named `pipeline_name`.

Parameters:

Name	Type	Description	Default
`pipeline_name`	`str`	Name of the pipeline.	required

Returns: Pipeline identifier or -1 if one does not exist.

Source code in cmflib/cmfquery.py

def get_pipeline_id(self, pipeline_name: str) -> int:
    """Look up the identifier of the pipeline named `pipeline_name`.

    Args:
        pipeline_name: Name of the pipeline.
    Returns:
        Pipeline identifier, or -1 when `_get_pipeline` finds no such pipeline.
    """
    found: t.Optional[mlpb.Context] = self._get_pipeline(pipeline_name)  # type: ignore  # Context type not recognized by mypy, using ignore to bypass
    if found:
        return found.id
    return -1

`get_pipeline_stages(pipeline_name)` ¶

Return list of pipeline stages for the pipeline with the given name.

Parameters:

Name	Type	Description	Default
`pipeline_name`	`str`	Name of the pipeline for which stages need to be returned. In CMF, there are no different pipelines with the same name.	required

Returns: List of stage names associated with the given pipeline.

Source code in cmflib/cmfquery.py

def get_pipeline_stages(self, pipeline_name: str) -> t.List[str]:
    """List the stage names that belong to the named pipeline.

    Args:
        pipeline_name: Name of the pipeline for which stages need to be returned. In CMF, there are no different
            pipelines with the same name.
    Returns:
        Stage names of every pipeline context returned for `pipeline_name`, or an empty list if there is none.
    """
    return [
        stage.name
        for pipeline in self._get_pipelines(pipeline_name)
        for stage in self._get_stages(pipeline.id)
    ]

`get_all_exe_in_stage(stage_name)` ¶

Return list of all executions for the stage with the given name.

Parameters:

Name	Type	Description	Default
`stage_name`	`str`	Name of the stage. Before stages are recorded in MLMD, they are modified (e.g., pipeline name will become part of the stage name). So stage names from different pipelines will not collide.	required

Returns: List of executions for the given stage.

Source code in cmflib/cmfquery.py

def get_all_exe_in_stage(self, stage_name: str) -> t.List[mlpb.Execution]:  # type: ignore  # Execution type not recognized by mypy, using ignore to bypass
    """Fetch the executions recorded under the stage with the given name.

    Pipelines are scanned in the order `_get_pipelines` returns them, and the search stops at the
    first stage whose name equals `stage_name`.

    Args:
        stage_name: Name of the stage. Before stages are recorded in MLMD, they are modified (e.g., pipeline name
                    will become part of the stage name). So stage names from different pipelines will not collide.
    Returns:
        List of executions for the given stage, or an empty list when no stage has that name.
    """
    for pipeline in self._get_pipelines():
        match = next((stage for stage in self._get_stages(pipeline.id) if stage.name == stage_name), None)
        if match is not None:
            return self.store.get_executions_by_context(match.id)
    return []

`get_all_executions_by_ids_list(exe_ids)` ¶

Return executions for given execution ids list as a pandas data frame.

Parameters:

Name	Type	Description	Default
`exe_ids`	`List[int]`	List of execution identifiers.	required

Returns:

Type	Description
`DataFrame`	Data frame with all executions for the list of given execution identifiers.

Source code in cmflib/cmfquery.py

def get_all_executions_by_ids_list(self, exe_ids: t.List[int]) -> pd.DataFrame:
    """Return executions for given execution ids list as a pandas data frame.

    Args:
        exe_ids: List of execution identifiers.

    Returns:
        Data frame with one row per execution returned by the store for `exe_ids`
        (rows are re-indexed from 0). An empty data frame is returned when the store
        yields no executions.
    """
    frames = [self._transform_to_dataframe(exe) for exe in self.store.get_executions_by_id(exe_ids)]
    if not frames:
        return pd.DataFrame()
    # Concatenate once: growing a frame inside a loop re-copies all accumulated rows each time.
    return pd.concat(frames, sort=True, ignore_index=True)

`get_all_artifacts_by_context(pipeline_name)` ¶

Return artifacts for given pipeline name as a pandas data frame.

Parameters:

Name	Type	Description	Default
`pipeline_name`	`str`	Name of the pipeline.	required

Returns:

Type	Description
`DataFrame`	Data frame with all artifacts associated with given pipeline name.

Source code in cmflib/cmfquery.py

def get_all_artifacts_by_context(self, pipeline_name: str) -> pd.DataFrame:
    """Return artifacts for given pipeline name as a pandas data frame.

    The pipeline id is resolved with `get_pipeline_id`; for the matching "Parent_Context"
    context, the artifacts of each of its child contexts are collected.

    Args:
        pipeline_name: Name of the pipeline.

    Returns:
        Data frame with all artifacts associated with given pipeline name (rows re-indexed
        from 0), or an empty data frame when nothing matches.
    """
    contexts = self.store.get_contexts_by_type("Parent_Context")
    context_id = self.get_pipeline_id(pipeline_name)

    frames: t.List[pd.DataFrame] = []
    for ctx in contexts:
        if ctx.id != context_id:
            continue
        for child_ctx in self.store.get_children_contexts_by_context(ctx.id):
            frames.extend(self.get_artifact_df(art) for art in self.store.get_artifacts_by_context(child_ctx.id))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame for every artifact.
    return pd.concat(frames, sort=True, ignore_index=True)

`get_all_artifacts_by_ids_list(artifact_ids)` ¶

Return all artifacts for the given artifact ids list.

Parameters:

Name	Type	Description	Default
`artifact_ids`	`List[int]`	List of artifact identifiers	required

Returns:

Type	Description
`DataFrame`	Data frame with all artifacts for the given artifact ids list.

Source code in cmflib/cmfquery.py

def get_all_artifacts_by_ids_list(self, artifact_ids: t.List[int]) -> pd.DataFrame:
    """Return all artifacts for the given artifact ids list.

    Args:
        artifact_ids: List of artifact identifiers

    Returns:
        Data frame with one row per artifact returned by the store for `artifact_ids`
        (rows re-indexed from 0), or an empty data frame when the store yields none.
    """
    frames = [self.get_artifact_df(art) for art in self.store.get_artifacts_by_id(artifact_ids)]
    if not frames:
        return pd.DataFrame()
    # Concatenate once: repeated concat in a loop copies all previous rows on every iteration.
    return pd.concat(frames, sort=True, ignore_index=True)

`get_all_executions_in_stage(stage_name)` ¶

Return executions of the given stage as a pandas data frame.

Args: stage_name: Stage name; see the `stage_name` description of `get_all_exe_in_stage`.

Returns: Data frame with all executions associated with the given stage.

Source code in cmflib/cmfquery.py

def get_all_executions_in_stage(self, stage_name: str) -> pd.DataFrame:
    """Return executions of the given stage as pandas data frame.

    Every pipeline is scanned, and executions of every stage whose name equals
    `stage_name` are included (the scan does not stop at the first match).

    Args:
        stage_name: Stage name; see the `stage_name` description of `get_all_exe_in_stage`.
    Returns:
        Data frame with all executions associated with the given stage (rows re-indexed
        from 0), or an empty data frame when no stage matches.
    """
    frames: t.List[pd.DataFrame] = []
    for pipeline in self._get_pipelines():
        for stage in self._get_stages(pipeline.id):
            if stage.name != stage_name:
                continue
            frames.extend(
                self._transform_to_dataframe(execution, {"id": execution.id, "name": execution.name})
                for execution in self._get_executions(stage.id)
            )

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame for every execution.
    return pd.concat(frames, sort=True, ignore_index=True)

`get_artifact_df(artifact, d=None)` ¶

Return artifact's data frame representation.

Parameters:

Name	Type	Description	Default
`artifact`	`Artifact`	MLMD entity representing artifact.	required
`d`	`Optional[Dict]`	Optional initial content for data frame.	`None`

Returns: A data frame with the single row containing attributes of this artifact.

Source code in cmflib/cmfquery.py

def get_artifact_df(self, artifact: mlpb.Artifact, d: t.Optional[t.Dict] = None) -> pd.DataFrame:   # type: ignore  # Artifact type not recognized by mypy, using ignore to bypass
    """Return artifact's data frame representation.

    The artifact's id, type name, uri, name and creation/update timestamps are merged over the
    optional initial content and handed to `_transform_to_dataframe`.

    Args:
        artifact: MLMD entity representing artifact.
        d: Optional initial content for data frame. It is copied, so the caller's dictionary is
            left unchanged; keys set by this method override same-named keys from `d`.
    Returns:
        A data frame with the single row containing attributes of this artifact.
    """
    # Copy rather than update in place: mutating the caller's dict would leak this artifact's
    # attributes into whatever the caller does with that dict afterwards.
    row: t.Dict = dict(d) if d is not None else {}
    row.update(
        {
            "id": artifact.id,
            # The type name requires a store lookup by the artifact's type id.
            "type": self.store.get_artifact_types_by_id([artifact.type_id])[0].name,
            "uri": artifact.uri,
            "name": artifact.name,
            "create_time_since_epoch": artifact.create_time_since_epoch,
            "last_update_time_since_epoch": artifact.last_update_time_since_epoch,
        }
    )
    return self._transform_to_dataframe(artifact, row)

`get_all_artifacts()` ¶

Return names of all artifacts.

Returns:

Type	Description
`List[str]`	List of all artifact names.

Source code in cmflib/cmfquery.py

def get_all_artifacts(self) -> t.List[str]:
    """Collect the name of every artifact returned by the store.

    Returns:
        List of all artifact names.
    """
    artifact_names: t.List[str] = []
    for stored in self.store.get_artifacts():
        artifact_names.append(stored.name)
    return artifact_names

`get_artifact(name)` ¶

Return artifact's data frame representation using artifact name.

Parameters:

Name	Type	Description	Default
`name`	`str`	Artifact name.	required

Returns: Pandas data frame with one row containing attributes of this artifact.

Source code in cmflib/cmfquery.py

def get_artifact(self, name: str) -> t.Optional[pd.DataFrame]:
    """Look up an artifact by name and return its data frame representation.

    Args:
        name: Artifact name.
    Returns:
        Pandas data frame with one row containing attributes of this artifact, or None when
        `_get_artifact` does not find an artifact with that name.
    """
    found: t.Optional[mlpb.Artifact] = self._get_artifact(name)  # type: ignore  # Artifact type not recognized by mypy, using ignore to bypass
    return self.get_artifact_df(found) if found else None

`get_all_artifacts_for_execution(execution_id)` ¶

Return input and output artifacts for the given execution.

Parameters:

Name	Type	Description	Default
`execution_id`	`int`	Execution identifier.	required

Returns: Data frame containing input and output artifacts for the given execution, one artifact per row.

Source code in cmflib/cmfquery.py

def get_all_artifacts_for_execution(self, execution_id: int) -> pd.DataFrame:
    """Return input and output artifacts for the given execution.

    Each row carries an extra "event" entry: "INPUT" when the linking event's type is
    `mlpb.Event.Type.INPUT`, and "OUTPUT" for every other event type.

    Args:
        execution_id: Execution identifier.
    Return:
        Data frame containing input and output artifacts for the given execution, one artifact per row
        (rows re-indexed from 0). An empty data frame is returned when the execution has no events.
    """
    frames: t.List[pd.DataFrame] = []
    for event in self.store.get_events_by_execution_ids([execution_id]):
        event_type = "INPUT" if event.type == mlpb.Event.Type.INPUT else "OUTPUT"   # type: ignore  # Event type not recognized by mypy, using ignore to bypass
        frames.extend(
            self.get_artifact_df(artifact, {"event": event_type})
            for artifact in self.store.get_artifacts_by_id([event.artifact_id])
        )

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame for every artifact.
    return pd.concat(frames, sort=True, ignore_index=True)

`get_all_artifact_types()` ¶

Return names of all artifact types.

Returns:

Type	Description
`List[str]`	List of all artifact types.

Source code in cmflib/cmfquery.py

def get_all_artifact_types(self) -> t.List[str]:
    """Collect the name of every artifact type registered in the store.

    Returns:
        List of all artifact types.
    """
    return [artifact_type.name for artifact_type in self.store.get_artifact_types()]

`get_all_executions_for_artifact(artifact_name)` ¶

Return executions that consumed and produced given artifact.

Parameters:

Name	Type	Description	Default
`artifact_name`	`str`	Artifact name.	required

Returns: Pandas data frame containing stage executions, one execution per row.

Source code in cmflib/cmfquery.py

def get_all_executions_for_artifact(self, artifact_name: str) -> pd.DataFrame:
    """Return executions that consumed and produced given artifact.

    For every event linked to the artifact, one row is produced with the columns "Type"
    ("INPUT" for `mlpb.Event.Type.INPUT` events, "OUTPUT" for any other event type),
    "execution_id", "execution_name", "execution_type_name", "stage" and "pipeline".

    Args:
        artifact_name: Artifact name.
    Returns:
        Pandas data frame containing stage executions, one execution per row (rows re-indexed
        from 0). An empty data frame is returned when the artifact is not found or has no events.
    """
    artifact: t.Optional[mlpb.Artifact] = self._get_artifact(artifact_name) # type: ignore  # Artifact type not recognized by mypy, using ignore to bypass
    if not artifact:
        return pd.DataFrame()

    frames: t.List[pd.DataFrame] = []
    for event in self.store.get_events_by_artifact_ids([artifact.id]):
        # The first context returned for the execution is used as its stage.
        stage_ctx = self.store.get_contexts_by_execution(event.execution_id)[0]
        # Fetch the execution once; it supplies both the name and the type-name property.
        execution = self.store.get_executions_by_id([event.execution_id])[0]
        linked_execution = {
            "Type": "INPUT" if event.type == mlpb.Event.Type.INPUT else "OUTPUT",   # type: ignore  # Event type not recognized by mypy, using ignore to bypass
            "execution_id": event.execution_id,
            "execution_name": execution.name,
            "execution_type_name": execution.properties['Execution_type_name'],
            "stage": stage_ctx.name,
            # The first parent context of the stage is reported as the pipeline.
            "pipeline": self.store.get_parent_contexts_by_context(stage_ctx.id)[0].name,
        }
        frames.append(pd.DataFrame(linked_execution, index=[0]))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame for every event.
    return pd.concat(frames, sort=True, ignore_index=True)

`get_one_hop_child_artifacts(artifact_name, pipeline_id=None)` ¶

Get artifacts produced by executions that consume given artifact.

Parameters:

Name	Type	Description	Default
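`artifact_name`	`str`	Name of an artifact.	required
`pipeline_id`	`Optional[int]`	Optional pipeline identifier passed to the lookup of executions that consumed the artifact.	`None`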

Returns: Output artifacts of all executions that consumed given artifact.

Source code in cmflib/cmfquery.py

def get_one_hop_child_artifacts(self, artifact_name: str, pipeline_id: t.Optional[int] = None) -> pd.DataFrame:
    """Find the artifacts written by executions that read the given artifact.

    Args:
        artifact_name: Name of an artifact.
        pipeline_id: Optional pipeline identifier, passed through unchanged to
            `_get_executions_by_input_artifact_id`.
    Return:
        Output artifacts of all executions that consumed given artifact, or an empty data frame
        when the artifact is not found.
    """
    parent: t.Optional[mlpb.Artifact] = self._get_artifact(artifact_name)    # type: ignore  # Artifact type not recognized by mypy, using ignore to bypass
    if not parent:
        return pd.DataFrame()

    # Executions that consumed the artifact, then the ids of everything those executions produced.
    consumers = self._get_executions_by_input_artifact_id(parent.id, pipeline_id)
    child_ids = self._get_output_artifacts(consumers)
    return self._as_pandas_df(self.store.get_artifacts_by_id(child_ids), self.get_artifact_df)

`get_all_child_artifacts(artifact_name)` ¶

Return all downstream artifacts starting from the given artifact.

Parameters:

Name	Type	Description	Default
`artifact_name`	`str`	Artifact name.	required

Returns: Data frame containing all child artifacts.

Source code in cmflib/cmfquery.py

def get_all_child_artifacts(self, artifact_name: str) -> pd.DataFrame:
    """Return all downstream artifacts starting from the given artifact.

    The direct children come first, followed by the recursively collected descendants of each
    child in order; exact duplicate rows are then dropped, keeping the first occurrence.

    NOTE(review): the recursion keeps no record of visited artifacts, so a lineage in which an
    artifact is its own descendant would recurse without bound.

    Args:
        artifact_name: Artifact name.
    Returns:
        Data frame containing all child artifacts.
    """
    direct_children = self.get_one_hop_child_artifacts(artifact_name)
    frames: t.List[pd.DataFrame] = [direct_children]
    for row in direct_children.itertuples():
        # `row.name` is the artifact-name column; str() keeps the recursive call's argument a plain string.
        frames.append(self.get_all_child_artifacts(str(row.name)))
    # One concat per level instead of one per child keeps the same row order with less copying.
    combined = pd.concat(frames, sort=True, ignore_index=True)
    return combined.drop_duplicates(subset=None, keep="first", inplace=False)

`get_one_hop_parent_artifacts(artifact_name)` ¶

Return input artifacts for the execution that produced the given artifact.

Parameters:

Name	Type	Description	Default
`artifact_name`	`str`	Artifact name.	required

Returns: Data frame containing the immediate parent artifacts of the given artifact.

Source code in cmflib/cmfquery.py

def get_one_hop_parent_artifacts(self, artifact_name: str) -> pd.DataFrame:
    """Find the artifacts read by the executions that wrote the given artifact.

    Args:
        artifact_name: Artifact name.
    Returns:
        Data frame containing the immediate parent artifacts of the given artifact, or an empty
        data frame when the artifact is not found.
    """
    child: t.Optional[mlpb.Artifact] = self._get_artifact(artifact_name) # type: ignore  # Artifact type not recognized by mypy, using ignore to bypass
    if not child:
        return pd.DataFrame()

    # Executions that produced the artifact, then the ids of everything those executions consumed.
    producers = self._get_executions_by_output_artifact_id(child.id)
    parent_ids: t.List[int] = self._get_input_artifacts(producers)
    return self._as_pandas_df(self.store.get_artifacts_by_id(parent_ids), self.get_artifact_df)

`get_all_parent_artifacts(artifact_name)` ¶

Return all upstream artifacts.

Parameters:

Name	Type	Description	Default
`artifact_name`	`str`	Artifact name.	required

Returns: Data frame containing all parent artifacts.

Source code in cmflib/cmfquery.py

def get_all_parent_artifacts(self, artifact_name: str) -> pd.DataFrame:
    """Return all upstream artifacts.

    The direct parents come first, followed by the recursively collected ancestors of each
    parent in order; exact duplicate rows are then dropped, keeping the first occurrence.

    NOTE(review): the recursion keeps no record of visited artifacts, so a lineage in which an
    artifact is its own ancestor would recurse without bound.

    Args:
        artifact_name: Artifact name.
    Returns:
        Data frame containing all parent artifacts.
    """
    direct_parents = self.get_one_hop_parent_artifacts(artifact_name)
    frames: t.List[pd.DataFrame] = [direct_parents]
    for row in direct_parents.itertuples():
        # `row.name` is the artifact-name column; str() keeps the recursive call's argument a plain string.
        frames.append(self.get_all_parent_artifacts(str(row.name)))
    # One concat per level instead of one per parent keeps the same row order with less copying.
    combined = pd.concat(frames, sort=True, ignore_index=True)
    return combined.drop_duplicates(subset=None, keep="first", inplace=False)

`get_all_parent_executions(artifact_name)` ¶

Return all executions that produced upstream artifacts for the given artifact.

Parameters:

Name	Type	Description	Default
`artifact_name`	`str`	Artifact name.	required

Returns: Data frame containing all parent executions.

Source code in cmflib/cmfquery.py

def get_all_parent_executions(self, artifact_name: str) -> pd.DataFrame:
    """Collect the executions that wrote any upstream artifact of the given artifact.

    Args:
        artifact_name: Artifact name.
    Returns:
        Data frame containing all parent executions, or an empty data frame when the artifact
        has no upstream artifacts.
    """
    upstream: pd.DataFrame = self.get_all_parent_artifacts(artifact_name)
    if not upstream.shape[0]:
        # Without rows there is no `id` column to read below, so stop here.
        return pd.DataFrame()

    upstream_ids = [int(artifact_id) for artifact_id in upstream.id.values.tolist()]
    # Only OUTPUT events identify the executions that produced those artifacts.
    producer_ids = {
        event.execution_id
        for event in self.store.get_events_by_artifact_ids(upstream_ids)
        if event.type == mlpb.Event.OUTPUT  # type: ignore  # Event type not recognized by mypy, using ignore to bypass
    }

    def _execution_row(_exec):
        return self._transform_to_dataframe(_exec, {"id": _exec.id, "name": _exec.name})

    return self._as_pandas_df(self.store.get_executions_by_id(producer_ids), _execution_row)

`get_metrics(metrics_name)` ¶

Return metric data frame.

Args: metrics_name: Metrics name.

Returns: Data frame containing all metrics.

Source code in cmflib/cmfquery.py

def get_metrics(self, metrics_name: str) -> t.Optional[pd.DataFrame]:
    """Load the data frame stored for the named "Step_Metrics" artifact.

    Only the first artifact whose name equals `metrics_name` is considered.

    Args:
        metrics_name: Metrics name.
    Returns:
        Data frame read with `pd.read_parquet` from the artifact's "Name" custom property, or
        None when no artifact matches or the property is missing or empty.
    """
    for candidate in self.store.get_artifacts_by_type("Step_Metrics"):
        if candidate.name != metrics_name:
            continue
        parquet_source: t.Optional[str] = candidate.custom_properties.get("Name", None)
        return pd.read_parquet(parquet_source) if parquet_source else None
    return None

`dumptojson(pipeline_name, exec_uuid=None)` ¶

Return JSON-parsable string containing details about the given pipeline.

Args: pipeline_name: Name of an AI pipeline. exec_uuid: Optional stage execution_uuid - filter stages by this execution_uuid.

Returns: Pipeline in JSON format.

Source code in cmflib/cmfquery.py

def dumptojson(self, pipeline_name: str, exec_uuid: t.Optional[str] = None) -> t.Optional[str]:
    """Serialize the named pipeline, including its stages, to a JSON string.

    Args:
        pipeline_name: Name of an AI pipeline.
        exec_uuid: Optional stage execution_uuid - filter stages by this execution_uuid.
    Returns:
        JSON string of the form {"Pipeline": [...]}, with one entry per pipeline context returned
        for `pipeline_name`; each entry carries its stage attributes under the "stages" key.
    """
    described: t.List[t.Dict] = [
        self._get_node_attributes(ctx, {"stages": self._get_stage_attributes(ctx.id, exec_uuid)})
        for ctx in self._get_pipelines(pipeline_name)
    ]
    return json.dumps({"Pipeline": described})

cmflib.cmfquery.CmfQuery¶

get_pipeline_names() ¶

get_pipeline_id(pipeline_name) ¶

get_pipeline_stages(pipeline_name) ¶

get_all_exe_in_stage(stage_name) ¶

get_all_executions_by_ids_list(exe_ids) ¶

get_all_artifacts_by_context(pipeline_name) ¶

get_all_artifacts_by_ids_list(artifact_ids) ¶

get_all_executions_in_stage(stage_name) ¶

get_artifact_df(artifact, d=None) ¶

get_all_artifacts() ¶

get_artifact(name) ¶

get_all_artifacts_for_execution(execution_id) ¶

get_all_artifact_types() ¶

get_all_executions_for_artifact(artifact_name) ¶

get_one_hop_child_artifacts(artifact_name, pipeline_id=None) ¶

get_all_child_artifacts(artifact_name) ¶

get_one_hop_parent_artifacts(artifact_name) ¶

get_all_parent_artifacts(artifact_name) ¶

get_all_parent_executions(artifact_name) ¶

get_metrics(metrics_name) ¶

dumptojson(pipeline_name, exec_uuid=None) ¶

`get_pipeline_names()` ¶

`get_pipeline_id(pipeline_name)` ¶

`get_pipeline_stages(pipeline_name)` ¶

`get_all_exe_in_stage(stage_name)` ¶

`get_all_executions_by_ids_list(exe_ids)` ¶

`get_all_artifacts_by_context(pipeline_name)` ¶

`get_all_artifacts_by_ids_list(artifact_ids)` ¶

`get_all_executions_in_stage(stage_name)` ¶

`get_artifact_df(artifact, d=None)` ¶

`get_all_artifacts()` ¶

`get_artifact(name)` ¶

`get_all_artifacts_for_execution(execution_id)` ¶

`get_all_artifact_types()` ¶

`get_all_executions_for_artifact(artifact_name)` ¶

`get_one_hop_child_artifacts(artifact_name, pipeline_id=None)` ¶

`get_all_child_artifacts(artifact_name)` ¶

`get_one_hop_parent_artifacts(artifact_name)` ¶

`get_all_parent_artifacts(artifact_name)` ¶

`get_all_parent_executions(artifact_name)` ¶

`get_metrics(metrics_name)` ¶

`dumptojson(pipeline_name, exec_uuid=None)` ¶