Skip to content

cmflib.cmf.Cmf.DataSlice

A data slice represents a named subset of data. It can be used to track performance of an ML model on different slices of the training or testing dataset splits. This can be useful from different perspectives, for instance, to mitigate model bias.

Instances of data slices are not meant to be created manually by users. Instead, use the Cmf.create_dataslice method.

Source code in cmflib/cmf.py
1398
1399
1400
1401
def __init__(self, name: str, writer):
    """Set up an empty data slice.

    Args:
        name: Name of the data slice.
        writer: Object stored on the instance as ``self.writer``; it is kept
            as-is and not validated here.
    """
    # Maps each added path to the properties recorded for it; starts empty.
    entries: dict[str, dict[str, str]] = {}
    self.writer = writer
    self.name = name
    self.props = entries

add_data(path, custom_properties=None)

Add data to create the dataslice. Currently supported only for file abstractions. Pre-condition: the parent folder containing the file should already be versioned. Example:

#dataslice.add_data(f"data/raw_data/{j}.xml")
Args: path: Name to identify the file to be added to the dataslice. custom_properties: Properties associated with this datum.

Source code in cmflib/cmf.py
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
def add_data(
    self, path: str, custom_properties: t.Optional[t.Dict] = None
) -> None:
    """Add data to create the dataslice.

    Currently supported only for file abstractions. Pre-condition - the parent folder containing the file
    should already be versioned.

    Example:
        ```python
        dataslice.add_data(f"data/raw_data/{j}.xml")
        ```

    Args:
        path: Name to identify the file to be added to the dataslice.
            "/" is treated as the path separator.
        custom_properties: Properties associated with this datum. They are
            merged on top of the computed ``hash`` entry, so a ``"hash"`` key
            supplied here overrides the computed value.
    """
    # Build the entry fully before storing it, so a failing hash lookup does
    # not leave a half-initialised (empty) record behind in self.props.
    entry = {"hash": dvc_get_hash(path)}
    if custom_properties:
        entry.update(custom_properties)
    self.props[path] = entry

    # Name of the folder that directly contains the file. rpartition never
    # raises, unlike indexing the result of rsplit: "data/raw_data/1.xml"
    # yields "raw_data", "data/1.xml" yields "data", and a bare "1.xml"
    # yields "".
    parent_path = path.rpartition("/")[0]
    self.data_parent = parent_path.rpartition("/")[2]

commit(custom_properties=None)

Commit the dataslice. The created dataslice is versioned and added to the underlying data versioning software. Example:

dataslice.commit()

Args: custom_properties: Dictionary of key-value pairs associated with the dataslice. Example: {"mean": 2.5, "median": 2.6}

Source code in cmflib/cmf.py
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
def commit(self, custom_properties: t.Optional[t.Dict] = None) -> t.Optional[t.Any]:
    """Commit the dataslice.

    The created dataslice is versioned and added to the underlying data versioning software.

    Example:
        ```python
        dataslice.commit()
        ```

    Args:
        custom_properties: Dictionary to store key value pairs associated with the Dataslice.
            Example: {"mean": 2.5, "median": 2.6}. The dictionary is copied, so the
            caller's object is not modified.

    Returns:
        The value returned by ``link_execution_to_input_artifact`` when an artifact
        with the same hash already exists in the store, otherwise the value returned
        by ``create_new_artifact_event_and_attribution``. ``None`` when the dvc hash
        of the written dataslice file could not be obtained.
    """

    logging_dir = change_dir(self.writer.cmf_init_path)
    try:
        # code for nano cmf
        # Assigning current file name as stage and execution name
        current_script = sys.argv[0]
        file_name = os.path.basename(current_script)
        assigned_name = os.path.splitext(file_name)[0]
        # create context if not already created
        if not self.writer.child_context:
            self.writer.create_context(pipeline_stage=assigned_name)
            assert self.writer.child_context is not None, f"Failed to create context for {self.writer.pipeline_name}!!"

        # create execution if not already created
        if not self.writer.execution:
            self.writer.create_execution(execution_type=assigned_name)
            assert self.writer.execution is not None, f"Failed to create execution for {self.writer.pipeline_name}!!"

        # The dataslice file lives under a folder named after the first
        # comma-separated entry of the execution's "Execution_uuid" property.
        execution_uuid = self.writer.execution.properties["Execution_uuid"].string_value.split(',')[0]
        directory_path = os.path.join(self.writer.ARTIFACTS_PATH, execution_uuid, self.writer.DATASLICE_PATH)
        os.makedirs(directory_path, exist_ok=True)
        # Work on a copy: git_repo/Commit are added below and must not leak
        # into the dictionary owned by the caller.
        custom_props = {} if custom_properties is None else dict(custom_properties)
        git_repo = git_get_repo()

        # Persist the collected per-path properties as a parquet file indexed by path.
        dataslice_df = pd.DataFrame.from_dict(self.props, orient="index")
        dataslice_df.index.names = ["Path"]
        dataslice_path = os.path.join(directory_path, self.name)
        dataslice_df.to_parquet(dataslice_path)

        commit_output(dataslice_path, self.writer.execution.id)
        c_hash = dvc_get_hash(dataslice_path)
        if c_hash == "":
            print("Error in getting the dvc hash,return without logging")
            return None

        url = dvc_get_url(dataslice_path)
        dvc_url_with_pipeline = f"{self.writer.parent_context.name}:{url}"

        # The hash doubles as the artifact uri; look for an already-registered artifact.
        existing_artifact = []
        if c_hash and c_hash.strip():
            existing_artifact.extend(
                self.writer.store.get_artifacts_by_uri(c_hash))

        if existing_artifact:
            print("Adding to existing data slice")
            # Haven't added event type in this if cond, is it not needed??
            dataslice_artifact = link_execution_to_input_artifact(
                store=self.writer.store,
                execution_id=self.writer.execution.id,
                uri=c_hash,
                input_name=dataslice_path + ":" + c_hash,
            )
        else:
            dataslice_artifact = create_new_artifact_event_and_attribution(
                store=self.writer.store,
                execution_id=self.writer.execution.id,
                context_id=self.writer.child_context.id,
                uri=c_hash,
                name=dataslice_path + ":" + c_hash,
                type_name="Dataslice",
                event_type=mlpb.Event.Type.OUTPUT,  # type: ignore  # Event type not recognized by mypy, using ignore to bypass
                properties={
                    "git_repo": str(git_repo),
                    # passing c_hash value to commit
                    "Commit": str(c_hash),
                    "url": str(dvc_url_with_pipeline),
                },
                artifact_type_properties={
                    "git_repo": mlpb.STRING,    # type: ignore  # String type not recognized by mypy, using ignore to bypass
                    "Commit": mlpb.STRING,  # type: ignore  # String type not recognized by mypy, using ignore to bypass
                    "url": mlpb.STRING, # type: ignore  # String type not recognized by mypy, using ignore to bypass
                },
                custom_properties=custom_props,
                milliseconds_since_epoch=int(time.time() * 1000),
            )

        # These extra keys are only added after the artifact call above, so
        # they reach the execution label props and the graph node only.
        custom_props["git_repo"] = git_repo
        custom_props["Commit"] = c_hash
        self.writer.execution_label_props["git_repo"] = git_repo
        self.writer.execution_label_props["Commit"] = c_hash
        if self.writer.graph:
            # NOTE(review): within the visible code self.data_parent is only
            # assigned by add_data(); committing a slice with no data while
            # the graph is enabled would raise AttributeError here - confirm.
            self.writer.driver.create_dataslice_node(
                self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, custom_props
            )
        return dataslice_artifact
    finally:
        # Always restore the caller's working directory, including on the
        # early return above and when any step raises.
        os.chdir(logging_dir)