Skip to content

cmflib.cmf.Cmf.DataSlice

A data slice represents a named subset of data. It can be used to track performance of an ML model on different slices of the training or testing dataset splits. This can be useful from different perspectives, for instance, to mitigate model bias.

Instances of data slices are not meant to be created manually by users. Instead, use the Cmf.create_dataslice method.

Source code in cmflib/cmf.py
1364
1365
1366
1367
def __init__(self, name: str, writer):
    """Set up an empty data slice bound to a metadata writer.

    Args:
        name: Name that identifies this data slice.
        writer: Object kept as ``self.writer``; ``commit`` goes through it to
            reach the metadata store, the execution and the contexts.
    """
    self.writer = writer
    self.name = name
    # Maps each added path to its property dictionary; filled by add_data.
    self.props = dict()

add_data(path, custom_properties=None)

Add data to create the dataslice. Currently supported only for file abstractions. Pre-condition: the parent folder containing the file should already be versioned. Example:

dataslice.add_data(f"data/raw_data/{j}.xml")
Args: path: Name to identify the file to be added to the dataslice. custom_properties: Properties associated with this datum.

Source code in cmflib/cmf.py
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
def add_data(
    self, path: str, custom_properties: t.Optional[t.Dict] = None
) -> None:
    """Add a file to the dataslice.

    Currently supported only for file abstractions. Pre-condition: the parent
    folder containing the file should already be versioned.

    Example:
        ```python
        dataslice.add_data(f"data/raw_data/{j}.xml")
        ```

    Args:
        path: Name to identify the file to be added to the dataslice.
        custom_properties: Properties associated with this datum. They are
            stored next to the file hash under the same ``path`` entry, so a
            ``"hash"`` key given here overrides the computed hash.
    """
    # Build a fresh entry so that re-adding a path discards its old properties.
    entry = {"hash": dvc_get_hash(path)}
    if custom_properties:
        entry.update(custom_properties)
    self.props[path] = entry

    # Record the name of the folder that directly contains the file.
    # rpartition never raises, so "data/1.xml" yields "data" and a bare
    # "1.xml" yields "" where indexing the result of rsplit would fail.
    parent_path = path.rpartition("/")[0]
    self.data_parent = parent_path.rpartition("/")[2]

commit(custom_properties=None)

Commit the dataslice. The created dataslice is versioned and added to the underlying data versioning software. Example:

dataslice.commit()

Args: custom_properties: Dictionary to store key-value pairs associated with the dataslice. Example: {"mean": 2.5, "median": 2.6}

Source code in cmflib/cmf.py
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
def commit(
    self, custom_properties: t.Optional[t.Dict] = None
) -> t.Optional[t.Any]:
    """Commit the dataslice.

    The created dataslice is written as a parquet file, versioned, and added
    to the underlying data versioning software.

    Example:
        ```python
        dataslice.commit()
        ```

    Args:
        custom_properties: Dictionary to store key-value pairs associated
            with the dataslice, e.g. ``{"mean": 2.5, "median": 2.6}``. The
            dictionary is copied; the caller's object is not modified.

    Returns:
        The value returned by ``link_execution_to_input_artifact`` (when an
        artifact with the same hash already exists) or by
        ``create_new_artifact_event_and_attribution`` (otherwise). ``None``
        when the dvc hash of the written file could not be obtained.
    """

    # change_dir's return value is handed back to os.chdir at the end, so it
    # is treated as the directory to restore (presumably the previous cwd).
    logging_dir = change_dir(self.writer.cmf_init_path)
    try:
        # code for nano cmf
        # Assigning current file name as stage and execution name
        current_script = sys.argv[0]
        file_name = os.path.basename(current_script)
        name_without_extension = os.path.splitext(file_name)[0]

        # create context if not already created
        if not self.writer.child_context:
            self.writer.create_context(pipeline_stage=name_without_extension)
            # The pipeline name is read from the parent context (as for the
            # url below); DataSlice itself has no pipeline_name attribute.
            assert self.writer.child_context is not None, (
                f"Failed to create context for {self.writer.parent_context.name}!!"
            )

        # create execution if not already created
        if not self.writer.execution:
            self.writer.create_execution(execution_type=name_without_extension)
            assert self.writer.execution is not None, (
                f"Failed to create execution for {self.writer.parent_context.name}!!"
            )

        # Only the first comma-separated entry of Execution_uuid names the folder.
        execution_uuid = self.writer.execution.properties[
            "Execution_uuid"
        ].string_value.split(',')[0]
        directory_path = os.path.join(
            self.writer.ARTIFACTS_PATH, execution_uuid, self.writer.DATASLICE_PATH
        )
        os.makedirs(directory_path, exist_ok=True)

        # Work on a copy: git_repo/Commit are added below and must not leak
        # into the dictionary owned by the caller.
        custom_props = dict(custom_properties) if custom_properties else {}
        git_repo = git_get_repo()

        # One row per added path, indexed by that path.
        dataslice_df = pd.DataFrame.from_dict(self.props, orient="index")
        dataslice_df.index.names = ["Path"]
        dataslice_path = os.path.join(directory_path, self.name)
        dataslice_df.to_parquet(dataslice_path)

        commit_output(dataslice_path, self.writer.execution.id)
        c_hash = dvc_get_hash(dataslice_path)
        if c_hash == "":
            print("Error in getting the dvc hash,return without logging")
            return None

        dataslice_commit = c_hash
        url = dvc_get_url(dataslice_path)
        dvc_url_with_pipeline = f"{self.writer.parent_context.name}:{url}"
        artifact_name = dataslice_path + ":" + c_hash

        existing_artifacts = []
        if c_hash and c_hash.strip():
            existing_artifacts.extend(
                self.writer.store.get_artifacts_by_uri(c_hash))

        if existing_artifacts:
            print("Adding to existing data slice")
            # Haven't added event type in this if cond, is it not needed??
            artifact = link_execution_to_input_artifact(
                store=self.writer.store,
                execution_id=self.writer.execution.id,
                uri=c_hash,
                input_name=artifact_name,
            )
        else:
            artifact = create_new_artifact_event_and_attribution(
                store=self.writer.store,
                execution_id=self.writer.execution.id,
                context_id=self.writer.child_context.id,
                uri=c_hash,
                name=artifact_name,
                type_name="Dataslice",
                event_type=mlpb.Event.Type.OUTPUT,
                properties={
                    "git_repo": str(git_repo),
                    # passing c_hash value to commit
                    "Commit": str(dataslice_commit),
                    "url": str(dvc_url_with_pipeline),
                },
                artifact_type_properties={
                    "git_repo": mlpb.STRING,
                    "Commit": mlpb.STRING,
                    "url": mlpb.STRING,
                },
                custom_properties=custom_props,
                milliseconds_since_epoch=int(time.time() * 1000),
            )

        custom_props["git_repo"] = git_repo
        custom_props["Commit"] = dataslice_commit
        self.writer.execution_label_props["git_repo"] = git_repo
        self.writer.execution_label_props["Commit"] = dataslice_commit
        if self.writer.graph:
            # self.data_parent is set by add_data.
            self.writer.driver.create_dataslice_node(
                self.name, artifact_name, c_hash, self.data_parent, custom_props
            )
        return artifact
    finally:
        # Restore the directory on every exit path, including the early
        # return above and any exception raised while committing.
        os.chdir(logging_dir)