Tool Permissions

Prerequisites
- Go to the Configuration tab --> Advanced options.
- Under the Spark tab, add the following three lines:
spark.spline.lineageDispatcher http
spark.spline.lineageDispatcher.http.producer.url http://<IP>:8080/producer
spark.spline.mode REQUIRED
Replace <IP> with the IP address that Octopai will provide you. Verify that data can be sent through this IP (the required outbound network access may need to be enabled; a quick connectivity check is sketched after this list).
- The customer should restart the Databricks cluster so the new Spark settings take effect.
- Install the Spline agent library on the Databricks cluster (see the steps below).
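To check connectivity, you can run a cell like the following in any notebook on the cluster. This is a minimal sketch: the URL assumes the producer endpoint configured above, with <IP> replaced. Any HTTP response code, even an error status, proves the network route is open; a connect timeout means it is still blocked.

%scala
// Hypothetical connectivity check for the Spline producer endpoint.
// Any HTTP response (even 404/405) means the route is open;
// a connect timeout means the firewall still blocks the address.
import java.net.{HttpURLConnection, URL}
val producerUrl = "http://<IP>:8080/producer"  // replace <IP> as above
val conn = new URL(producerUrl).openConnection().asInstanceOf[HttpURLConnection]
conn.setConnectTimeout(5000)
println(s"HTTP status: ${conn.getResponseCode}")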
How to set up the permissions
Click on Install New.
Select Maven.
Select Search Packages and search for the Spline agent bundle.
Note: the version of the Spline agent must match the Databricks runtime version.
Click the Install button.
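If you prefer to enter the Maven coordinates directly instead of searching, they follow this pattern (illustrative only; choose the bundle that matches your runtime's Spark and Scala versions, shown here for a Spark 3.3 / Scala 2.12 runtime):

za.co.absa.spline.agent.spark:spark-3.3-spline-agent-bundle_2.12:2.0.0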
In each notebook that Octopai needs to analyze:
Create a new cell and add the following Scala code
(for Spline agent versions up to 1.0.4):
%scala
import scala.util.parsing.json.JSON
import za.co.absa.spline.harvester.SparkLineageInitializer._
import za.co.absa.spline.agent.AgentConfig
import za.co.absa.spline.harvester.postprocessing.PostProcessingFilter
import org.apache.commons.configuration.Configuration
import za.co.absa.spline.harvester.HarvestingContext
import za.co.absa.spline.harvester.ExtraMetadataImplicits._
import za.co.absa.spline.producer.model.{DataOperation, ExecutionEvent, ExecutionPlan, ReadOperation, WriteOperation}

// Collect notebook metadata from the Databricks notebook context.
val notebookInformationJson = dbutils.notebook.getContext.toJson
val outerMap = JSON.parseFull(notebookInformationJson).getOrElse(Map.empty).asInstanceOf[Map[String, Any]]
val tagMap = outerMap("tags").asInstanceOf[Map[String, String]]
val extraContextMap = outerMap("extraContext").asInstanceOf[Map[String, String]]
val notebookPath = extraContextMap("notebook_path").split("/")

val workspaceUrl = tagMap("browserHostName")
val workspaceName = dbutils.notebook.getContext.notebookPath.get
val notebookURL = tagMap("browserHostName") + "/?o=" + tagMap("orgId") + tagMap("browserHash")
val user = tagMap("user")
val name = notebookPath(notebookPath.size - 1)

val notebookInfo = Map(
  "notebookURL" -> notebookURL,
  "user" -> user,
  "workspaceName" -> workspaceName,
  "workspaceUrl" -> workspaceUrl,
  "name" -> name,
  "mounts" -> dbutils.fs.ls("/FileStore/tables").map(_.path),
  "timestamp" -> System.currentTimeMillis)
val notebookInfoJson = scala.util.parsing.json.JSONObject(notebookInfo)

// Post-processing filter that attaches the notebook metadata to every
// execution plan sent to the Spline producer.
class CustomFilter extends PostProcessingFilter {
  def this(conf: Configuration) = this()

  override def processExecutionEvent(event: ExecutionEvent, ctx: HarvestingContext): ExecutionEvent =
    event.withAddedExtra(Map("foo" -> "bar"))

  override def processExecutionPlan(plan: ExecutionPlan, ctx: HarvestingContext): ExecutionPlan =
    plan.withAddedExtra(Map("notebookInfo" -> notebookInfoJson))

  override def processReadOperation(op: ReadOperation, ctx: HarvestingContext): ReadOperation =
    op.withAddedExtra(Map("foo" -> "bar"))

  override def processWriteOperation(op: WriteOperation, ctx: HarvestingContext): WriteOperation =
    op.withAddedExtra(Map("foo" -> "bar"))

  override def processDataOperation(op: DataOperation, ctx: HarvestingContext): DataOperation =
    op.withAddedExtra(Map("foo" -> "bar"))
}

val myInstance = new CustomFilter()

// Initialize the Spline agent with the custom filter attached.
spark.enableLineageTracking(
  AgentConfig.builder()
    .postProcessingFilter(myInstance)
    .build()
)
In Python notebooks, add the following as the first line of the notebook:
sc._jvm.za.co.absa.spline.harvester.SparkLineageInitializer.enableLineageTracking(spark._jsparkSession)
Run the notebooks; Octopai's engines will then collect and analyze the lineage they produce.
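To verify that lineage events are actually being emitted, note that Spline captures lineage only when a write action runs. A minimal smoke test you could run after the initialization cell (the output path is just an example):

%scala
// Spline emits an execution event only on write actions, so a small
// DataFrame write is enough to exercise the end-to-end flow.
// The output path below is an arbitrary example.
import spark.implicits._
val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
df.write.mode("overwrite").parquet("/tmp/spline_smoke_test")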
Note: after the initial setup, for security purposes, please provide Octopai with your public IP address so it can be added to the whitelist.