def commit(self, custom_properties: t.Optional[t.Dict] = None) -> t.Optional[mlpb.Artifact]:
"""Commit the dataslice.
The created dataslice is versioned and added to underneath data versioning software.
Example:
dataslice.commit()
```
Args:
custom_properties: Dictionary to store key value pairs associated with Dataslice
Example{"mean":2.5, "median":2.6}
"""
logging_dir = change_dir(self.writer.cmf_init_path)
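# change_dir moves into the cmf init path; as used here it returns the previous
# working directory, which is restored via os.chdir at the end of this method.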
# Nano-cmf path: use the current script's file name (minus extension) as both
# the stage (context) name and the execution name.
current_script = sys.argv[0]
file_name = os.path.basename(current_script)
assigned_name = os.path.splitext(file_name)[0]
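# e.g. invoking "python train.py" would yield assigned_name == "train" (illustrative).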
# create context if not already created
if not self.writer.child_context:
self.writer.create_context(pipeline_stage=assigned_name)
assert self.writer.child_context is not None, f"Failed to create context for {self.writer.pipeline_name}!!"
# create execution if not already created
if not self.writer.execution:
self.writer.create_execution(execution_type=assigned_name)
assert self.writer.execution is not None, f"Failed to create execution for {self.writer.pipeline_name}!!"
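# Dataslices are written under <ARTIFACTS_PATH>/<first Execution_uuid>/<DATASLICE_PATH>/.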
directory_path = os.path.join(
self.writer.ARTIFACTS_PATH,
self.writer.execution.properties["Execution_uuid"].string_value.split(',')[0],
self.writer.DATASLICE_PATH,
)
os.makedirs(directory_path, exist_ok=True)
custom_props = {} if custom_properties is None else custom_properties
git_repo = git_get_repo()
dataslice_df = pd.DataFrame.from_dict(self.props, orient="index")
dataslice_df.index.names = ["Path"]
dataslice_path = os.path.join(directory_path, self.name)
dataslice_df.to_parquet(dataslice_path)
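# The resulting parquet file holds one row per sliced path (paths and props
# below are illustrative, not from the source):
#   Path                      hash
#   data/raw/part-0000.csv    d1e2f3...
#   data/raw/part-0001.csv    a4b5c6...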
existing_artifact = []
commit_output(dataslice_path, self.writer.execution.id)
c_hash = dvc_get_hash(dataslice_path)
if c_hash == "":
print("Error in getting the dvc hash,return without logging")
return
dataslice_commit = c_hash
url = dvc_get_url(dataslice_path)
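# Namespace the DVC URL with the parent pipeline name,
# e.g. "my_pipeline:<dvc_url>" (pipeline name is illustrative).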
dvc_url_with_pipeline = f"{self.writer.parent_context.name}:{url}"
if c_hash and c_hash.strip():
existing_artifact.extend(
self.writer.store.get_artifacts_by_uri(c_hash))
if existing_artifact:
print("Adding to existing data slice")
# TODO: confirm whether an event type should also be recorded in this branch.
dataslice_artifact = link_execution_to_input_artifact(
store=self.writer.store,
execution_id=self.writer.execution.id,
uri=c_hash,
input_name=dataslice_path + ":" + c_hash,
)
else:
dataslice_artifact = create_new_artifact_event_and_attribution(
store=self.writer.store,
execution_id=self.writer.execution.id,
context_id=self.writer.child_context.id,
uri=c_hash,
name=dataslice_path + ":" + c_hash,
type_name="Dataslice",
event_type=mlpb.Event.Type.OUTPUT, # type: ignore # Event type not recognized by mypy, using ignore to bypass
properties={
"git_repo": str(git_repo),
# passing c_hash value to commit
"Commit": str(dataslice_commit),
"url": str(dvc_url_with_pipeline),
},
artifact_type_properties={
"git_repo": mlpb.STRING, # type: ignore # String type not recognized by mypy, using ignore to bypass
"Commit": mlpb.STRING, # type: ignore # String type not recognized by mypy, using ignore to bypass
"url": mlpb.STRING, # type: ignore # String type not recognized by mypy, using ignore to bypass
},
custom_properties=custom_props,
milliseconds_since_epoch=int(time.time() * 1000),
)
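# Record the repo and commit on both props dicts so the graph dataslice node
# created below carries them as well.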
custom_props["git_repo"] = git_repo
custom_props["Commit"] = dataslice_commit
self.writer.execution_label_props["git_repo"] = git_repo
self.writer.execution_label_props["Commit"] = dataslice_commit
if self.writer.graph:
self.writer.driver.create_dataslice_node(
self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, custom_props
)
os.chdir(logging_dir)  # restore the caller's working directory
return dataslice_artifact
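# Minimal usage sketch (hypothetical names and file paths; assumes a Cmf writer
# configured for this pipeline):
#
#     cmf = cmflib.cmf.Cmf(filepath="mlmd", pipeline_name="demo_pipeline")
#     dataslice = cmf.create_dataslice(name="slice-1")
#     dataslice.add_data("data/raw/part-0000.csv")
#     dataslice.commit(custom_properties={"mean": 2.5, "median": 2.6})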