Skip to content

Commit a6ecb9f

Browse files
authored
[SPARKNLP-1315] Changing input data type for CamemBertForTokenClassification (#14701)
* SPARKNLP-1315: change input data type for CamemBertForTokenClassification from int64 to int32
* SPARKNLP-1315: add test for TensorFlow models
1 parent 5a43dfc commit a6ecb9f

File tree

2 files changed

+39
-7
lines changed

2 files changed

+39
-7
lines changed

src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -164,28 +164,27 @@ private[johnsnowlabs] class CamemBertClassification(
164164
val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max
165165
val batchLength = batch.length
166166

167-
val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength)
168-
val maskBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength)
167+
val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)
168+
val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)
169169

170170
// [nb of encoded sentences , maxSentenceLength]
171171
val shape = Array(batch.length.toLong, maxSentenceLength)
172172

173173
batch.zipWithIndex
174174
.foreach { case (sentence, idx) =>
175-
val sentenceLong = sentence.map(x => x.toLong)
176175
val offset = idx * maxSentenceLength
177-
tokenBuffers.offset(offset).write(sentenceLong)
176+
tokenBuffers.offset(offset).write(sentence)
178177
maskBuffers
179178
.offset(offset)
180-
.write(sentence.map(x => if (x == sentencePadTokenId) 0L else 1L))
179+
.write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1))
181180
}
182181

183182
val runner = tensorflowWrapper.get
184183
.getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false)
185184
.runner
186185

187-
val tokenTensors = tensors.createLongBufferTensor(shape, tokenBuffers)
188-
val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers)
186+
val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers)
187+
val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers)
189188

190189
runner
191190
.feed(

src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassificationTestSpec.scala

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,4 +157,37 @@ class CamemBertForTokenClassificationTestSpec extends AnyFlatSpec {
157157

158158
assert(totalTokens == totalTags)
159159
}
160+
161+
162+
"CamemBertForTokenClassification" should "work with tensorflow models" taggedAs SlowTest in {
163+
164+
val tokenClassifier: CamemBertForTokenClassification = CamemBertForTokenClassification
165+
.pretrained("camembert_classifier_base_wikipedia_4gb_finetuned_job_ner")
166+
.setInputCols(Array("token", "document"))
167+
.setOutputCol("ner")
168+
.setCaseSensitive(true)
169+
.setMaxSentenceLength(512)
170+
171+
val pipeline = new Pipeline().setStages(Array(document, tokenizer, tokenClassifier))
172+
173+
val pipelineModel = pipeline.fit(ddd)
174+
val pipelineDF = pipelineModel.transform(ddd)
175+
176+
pipelineDF.select("token.result").show(false)
177+
pipelineDF.select("ner.result").show(false)
178+
pipelineDF
179+
.withColumn("token_size", size(col("token")))
180+
.withColumn("ner_size", size(col("ner")))
181+
.where(col("token_size") =!= col("ner_size"))
182+
.select("token_size", "ner_size", "token.result", "ner.result")
183+
.show(false)
184+
185+
val totalTokens = pipelineDF.select(explode($"token.result")).count.toInt
186+
val totalEmbeddings = pipelineDF.select(explode($"ner.result")).count.toInt
187+
188+
println(s"total tokens: $totalTokens")
189+
println(s"total embeddings: $totalEmbeddings")
190+
191+
}
192+
160193
}

0 commit comments

Comments
 (0)