Skip to content

cmflib.cmf.Cmf.DataSlice

A data slice represents a named subset of data. It can be used to track performance of an ML model on different slices of the training or testing dataset splits. This can be useful from different perspectives, for instance, to mitigate model bias.

Instances of data slices are not meant to be created manually by users. Instead, use the Cmf.create_dataslice method.

Source code in cmflib/cmf.py
1513
1514
1515
1516
def __init__(self, name: str, writer):
    """Set up an empty data slice.

    Args:
        name: Name of the data slice; kept on the instance as ``name``.
        writer: Object kept on the instance as ``writer``.
    """
    self.writer = writer
    self.name = name
    # Maps each added path to its dictionary of properties; starts out empty.
    self.props: dict[str, dict[str, str]] = {}

add_data(path, custom_properties=None)

Add data to create the dataslice. Currently supported only for file abstractions. Precondition: the parent folder containing the file should already be versioned. Example:

#dataslice.add_data(f"data/raw_data/{j}.xml")
Args: path: Name to identify the file to be added to the dataslice. custom_properties: Properties associated with this datum.

Source code in cmflib/cmf.py
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
def add_data(
    self, path: str, custom_properties: t.Optional[t.Dict] = None
) -> None:
    """Add a file to the dataslice.

    Currently supported only for file abstractions. Precondition: the parent
    folder containing the file should already be versioned.

    Example:
        ```python
        dataslice.add_data(f"data/raw_data/{j}.xml")
        ```
    Args:
        path: Name to identify the file to be added to the dataslice.
        custom_properties: Properties associated with this datum. They are
            stored next to the "hash" entry for ``path``; a custom "hash"
            key overrides the computed one.
    """
    # Build the entry first so nothing is recorded if hashing fails.
    entry = {"hash": dvc_get_hash(path)}
    if custom_properties:
        entry.update(custom_properties)
    self.props[path] = entry

    # data_parent is the last component of the directory part of ``path``.
    # rpartition never raises, so paths with fewer than two "/" separators
    # ("data/file.xml", "file.xml") no longer fail with IndexError: they
    # yield "data" and "" respectively.
    parent_path = path.rpartition("/")[0]
    self.data_parent = parent_path.rpartition("/")[2]

commit(custom_properties=None)

Commit the dataslice. The created dataslice is versioned and added to the underlying data versioning software. Example:

```python
dataslice.commit()
```

Args: custom_properties: Dictionary of key-value pairs associated with the dataslice. Example: {"mean": 2.5, "median": 2.6}

Source code in cmflib/cmf.py
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
def commit(self, custom_properties: t.Optional[t.Dict] = None) -> t.Optional[t.Any]:
    """Commit the dataslice.

    The collected per-path properties are written as a parquet file, which is
    then versioned and added to the underlying data versioning software and
    recorded in the metadata store.

    Example:
        ```python
        dataslice.commit()
        ```
    Args:
        custom_properties: Dictionary to store key value pairs associated with
            the Dataslice, e.g. {"mean": 2.5, "median": 2.6}. When a dictionary
            is passed, the "git_repo" and "Commit" keys are added to it in place.

    Returns:
        The value returned by ``link_execution_to_input_artifact`` (when the
        store already has artifacts for the hash) or by
        ``create_new_artifact_event_and_attribution`` (otherwise); ``None``
        when no dvc hash could be obtained.

    Note:
        When ``self.writer.graph`` is truthy this method reads
        ``self.data_parent``, which is only assigned by ``add_data``.
    """

    # NOTE(review): change_dir presumably switches to cmf_init_path and returns
    # the previous working directory - confirm against its definition.
    logging_dir = change_dir(self.writer.cmf_init_path)
    try:
        # code for nano cmf
        # Assigning current file name as stage and execution name
        current_script = sys.argv[0]
        assigned_name = os.path.splitext(os.path.basename(current_script))[0]

        # create context if not already created
        if not self.writer.child_context:
            self.writer.create_context(pipeline_stage=assigned_name)
            assert self.writer.child_context is not None, f"Failed to create context for {self.writer.pipeline_name}!!"

        # create execution if not already created
        if not self.writer.execution:
            self.writer.create_execution(execution_type=assigned_name)
            assert self.writer.execution is not None, f"Failed to create execution for {self.writer.pipeline_name}!!"

        # Only the first comma-separated entry of Execution_uuid names the directory.
        execution_uuid = self.writer.execution.properties["Execution_uuid"].string_value.split(',')[0]
        directory_path = os.path.join(
            self.writer.ARTIFACTS_PATH, execution_uuid, self.writer.DATASLICE_PATH
        )
        os.makedirs(directory_path, exist_ok=True)

        # Aliases the caller's dictionary on purpose (see Args): keys are added below.
        custom_props = {} if custom_properties is None else custom_properties
        git_repo = git_get_repo()

        # One row per added path, indexed by the path itself.
        dataslice_df = pd.DataFrame.from_dict(self.props, orient="index")
        dataslice_df.index.names = ["Path"]
        dataslice_path = os.path.join(directory_path, self.name)
        dataslice_df.to_parquet(dataslice_path)

        commit_output(dataslice_path, self.writer.execution.id)
        c_hash = dvc_get_hash(dataslice_path)
        if c_hash == "":
            print("Error in getting the dvc hash,return without logging")
            return None

        dataslice_commit = c_hash
        url = dvc_get_url(dataslice_path)
        dvc_url_with_pipeline = f"{self.writer.parent_context.name}:{url}"
        artifact_name = dataslice_path + ":" + c_hash

        existing_artifact = []
        if c_hash and c_hash.strip():
            existing_artifact.extend(
                self.writer.store.get_artifacts_by_uri(c_hash))

        if existing_artifact:
            print("Adding to existing data slice")
            # Haven't added event type in this if cond, is it not needed??
            dataslice_artifact = link_execution_to_input_artifact(
                store=self.writer.store,
                execution_id=self.writer.execution.id,
                uri=c_hash,
                input_name=artifact_name,
            )
        else:
            dataslice_artifact = create_new_artifact_event_and_attribution(
                store=self.writer.store,
                execution_id=self.writer.execution.id,
                context_id=self.writer.child_context.id,
                uri=c_hash,
                name=artifact_name,
                type_name="Dataslice",
                event_type=mlpb.Event.Type.OUTPUT,  # type: ignore  # Event type not recognized by mypy, using ignore to bypass
                properties={
                    "git_repo": str(git_repo),
                    # passing c_hash value to commit
                    "Commit": str(dataslice_commit),
                    "url": str(dvc_url_with_pipeline),
                },
                artifact_type_properties={
                    "git_repo": mlpb.STRING,    # type: ignore  # String type not recognized by mypy, using ignore to bypass
                    "Commit": mlpb.STRING,  # type: ignore  # String type not recognized by mypy, using ignore to bypass
                    "url": mlpb.STRING, # type: ignore  # String type not recognized by mypy, using ignore to bypass
                },
                custom_properties=custom_props,
                milliseconds_since_epoch=int(time.time() * 1000),
            )

        custom_props["git_repo"] = git_repo
        custom_props["Commit"] = dataslice_commit
        self.writer.execution_label_props["git_repo"] = git_repo
        self.writer.execution_label_props["Commit"] = dataslice_commit
        if self.writer.graph:
            self.writer.driver.create_dataslice_node(
                self.name, artifact_name, c_hash, self.data_parent, custom_props
            )
        return dataslice_artifact
    finally:
        # Always go back to the directory we started from, including on the
        # early return above and when an exception propagates.
        os.chdir(logging_dir)