def commit(self, custom_properties: t.Optional[t.Dict] = None) -> t.Optional[mlpb.Artifact]:
    """Commit the dataslice.

    The created dataslice is versioned and tracked by the underlying data
    versioning software (DVC).

    Example:
        ```python
        dataslice.commit()
        ```

    Args:
        custom_properties: Dictionary of key/value pairs to associate with the
            dataslice, e.g. {"mean": 2.5, "median": 2.6}.

    Returns:
        The dataslice artifact, or None if the DVC hash could not be obtained.
    """
    logging_dir = change_dir(self.writer.cmf_init_path)
    # Nano-cmf mode: use the current script's file name as the stage and
    # execution name.
    current_script = sys.argv[0]
    file_name = os.path.basename(current_script)
    name_without_extension = os.path.splitext(file_name)[0]
    # Create the context if it has not been created already.
    if not self.writer.child_context:
        self.writer.create_context(pipeline_stage=name_without_extension)
        assert self.writer.child_context is not None, f"Failed to create context for {name_without_extension}!"
    # Create the execution if it has not been created already.
    if not self.writer.execution:
        self.writer.create_execution(execution_type=name_without_extension)
        assert self.writer.execution is not None, f"Failed to create execution for {name_without_extension}!"
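    # Layout note (derived from the path components used below): each slice is
    # written under <ARTIFACTS_PATH>/<Execution_uuid>/<DATASLICE_PATH>/<name>,
    # so slices from different executions do not collide.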
    execution_uuid = self.writer.execution.properties["Execution_uuid"].string_value.split(",")[0]
    directory_path = os.path.join(
        self.writer.ARTIFACTS_PATH, execution_uuid, self.writer.DATASLICE_PATH
    )
    os.makedirs(directory_path, exist_ok=True)
    custom_props = {} if custom_properties is None else custom_properties
    git_repo = git_get_repo()
    # Serialize the slice (a mapping of file path -> properties) to Parquet.
    dataslice_df = pd.DataFrame.from_dict(self.props, orient="index")
    dataslice_df.index.names = ["Path"]
    dataslice_path = os.path.join(directory_path, self.name)
    dataslice_df.to_parquet(dataslice_path)
    existing_artifact = []
    commit_output(dataslice_path, self.writer.execution.id)
    c_hash = dvc_get_hash(dataslice_path)
    if c_hash == "":
        print("Error getting the DVC hash; returning without logging.")
        os.chdir(logging_dir)  # restore the working directory before bailing out
        return None
    dataslice_commit = c_hash
    url = dvc_get_url(dataslice_path)
    dvc_url_with_pipeline = f"{self.writer.parent_context.name}:{url}"
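    # If an artifact with this content hash is already registered in the MLMD
    # store, link it to this execution instead of creating a duplicate.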
    if c_hash and c_hash.strip():
        existing_artifact.extend(self.writer.store.get_artifacts_by_uri(c_hash))
    if existing_artifact:
        print("Adding to existing data slice")
        # Link the existing artifact to this execution as an input.
        slice_artifact = link_execution_to_input_artifact(
            store=self.writer.store,
            execution_id=self.writer.execution.id,
            uri=c_hash,
            input_name=dataslice_path + ":" + c_hash,
        )
    else:
        slice_artifact = create_new_artifact_event_and_attribution(
            store=self.writer.store,
            execution_id=self.writer.execution.id,
            context_id=self.writer.child_context.id,
            uri=c_hash,
            name=dataslice_path + ":" + c_hash,
            type_name="Dataslice",
            event_type=mlpb.Event.Type.OUTPUT,
            properties={
                "git_repo": str(git_repo),
                # The content hash doubles as the commit identifier.
                "Commit": str(dataslice_commit),
                "url": str(dvc_url_with_pipeline),
            },
            artifact_type_properties={
                "git_repo": mlpb.STRING,
                "Commit": mlpb.STRING,
                "url": mlpb.STRING,
            },
            custom_properties=custom_props,
            milliseconds_since_epoch=int(time.time() * 1000),
        )
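    # Record git provenance on the custom properties and on the execution's
    # label properties so the slice can be traced back to its repo and commit.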
    custom_props["git_repo"] = git_repo
    custom_props["Commit"] = dataslice_commit
    self.writer.execution_label_props["git_repo"] = git_repo
    self.writer.execution_label_props["Commit"] = dataslice_commit
    if self.writer.graph:
        self.writer.driver.create_dataslice_node(
            self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, custom_props
        )
    os.chdir(logging_dir)
    return slice_artifact
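# A minimal usage sketch. The writer/slice creation calls below follow the
# common cmflib pattern but are assumptions, not verbatim API: the constructor
# parameters and helper method names may differ in your version.
#
#   from cmflib import cmf
#
#   metawriter = cmf.Cmf(filepath="mlmd", pipeline_name="demo")  # assumed signature
#   dataslice = metawriter.create_dataslice(name="slice-a")      # assumed helper
#   dataslice.add_data("artifacts/raw_data/file1")               # assumed helper
#   dataslice.commit(custom_properties={"mean": 2.5, "median": 2.6})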