Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion core/amber/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ python-lsp-server[all]==1.5.0
python-lsp-server[websockets]
bidict==0.22.0
cached_property
psutil
psutil
transformers
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import edu.uci.ics.texera.workflow.operators.intersect.IntersectOpDesc
import edu.uci.ics.texera.workflow.operators.intervalJoin.IntervalJoinOpDesc
import edu.uci.ics.texera.workflow.operators.keywordSearch.KeywordSearchOpDesc
import edu.uci.ics.texera.workflow.operators.limit.LimitOpDesc
import edu.uci.ics.texera.workflow.operators.huggingFace.HuggingFaceSentimentAnalysisOpDesc
import edu.uci.ics.texera.workflow.operators.projection.ProjectionOpDesc
import edu.uci.ics.texera.workflow.operators.randomksampling.RandomKSamplingOpDesc
import edu.uci.ics.texera.workflow.operators.regex.RegexOpDesc
Expand Down Expand Up @@ -180,7 +181,11 @@ trait StateTransferFunc
new Type(value = classOf[FunnelPlotOpDesc], name = "FunnelPlot"),
new Type(value = classOf[TablesPlotOpDesc], name = "TablesPlot"),
new Type(value = classOf[JavaUDFOpDesc], name = "JavaUDF"),
new Type(value = classOf[SortOpDesc], name = "Sort")
new Type(value = classOf[SortOpDesc], name = "Sort"),
new Type(
value = classOf[HuggingFaceSentimentAnalysisOpDesc],
name = "HuggingFaceSentimentAnalysis"
)
)
)
abstract class LogicalOp extends PortDescriptor with Serializable {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package edu.uci.ics.texera.workflow.operators.huggingFace

import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort}
import edu.uci.ics.texera.workflow.common.metadata.annotations.AutofillAttributeName
import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo}
import edu.uci.ics.texera.workflow.common.operators.PythonOperatorDescriptor
import edu.uci.ics.texera.workflow.common.tuple.schema.{AttributeType, Schema}

/**
  * Operator descriptor that generates a Python UDF running the Hugging Face model
  * "cardiffnlp/twitter-roberta-base-sentiment-latest" on one input column.
  *
  * For every input tuple the generated operator writes three extra columns holding the
  * softmax scores (rounded to 4 decimals) for the "positive", "neutral" and "negative"
  * labels. The names of those columns are configurable through the properties below.
  *
  * NOTE(review): the configured names are interpolated verbatim into Python string
  * literals, so a name containing a double quote or backslash would produce invalid
  * generated code.
  */
class HuggingFaceSentimentAnalysisOpDesc extends PythonOperatorDescriptor {

  /** Name of the input column whose text is analyzed. */
  @JsonProperty(value = "attribute", required = true)
  @JsonPropertyDescription("column to perform sentiment analysis on")
  @AutofillAttributeName
  var attribute: String = _

  /** Name of the output column receiving the score of the "positive" label. */
  @JsonProperty(
    value = "Positive result attribute",
    required = true,
    defaultValue = "huggingface_sentiment_positive"
  )
  @JsonPropertyDescription("column name of the sentiment analysis result (positive)")
  var resultAttributePositive: String = _

  /** Name of the output column receiving the score of the "neutral" label. */
  @JsonProperty(
    value = "Neutral result attribute",
    required = true,
    defaultValue = "huggingface_sentiment_neutral"
  )
  @JsonPropertyDescription("column name of the sentiment analysis result (neutral)")
  var resultAttributeNeutral: String = _

  /** Name of the output column receiving the score of the "negative" label. */
  @JsonProperty(
    value = "Negative result attribute",
    required = true,
    defaultValue = "huggingface_sentiment_negative"
  )
  @JsonPropertyDescription("column name of the sentiment analysis result (negative)")
  var resultAttributeNegative: String = _

  /**
    * Builds the source of the Python operator.
    *
    * The script loads the tokenizer, config and model once in `open`, then for each tuple
    * tokenizes the configured column, runs the model, applies softmax to the logits and
    * stores each label's score under the configured result column.
    *
    * The tokenizer call truncates to 512 tokens: without truncation, text longer than the
    * model's position-embedding limit makes the forward pass fail.
    *
    * @return the Python code of the generated operator
    */
  override def generatePythonCode(): String = {
    s"""from pytexera import *
       |from transformers import AutoModelForSequenceClassification
       |from transformers import AutoTokenizer, AutoConfig
       |import numpy as np
       |from scipy.special import softmax
       |
       |class ProcessTupleOperator(UDFOperatorV2):
       |
       |    def open(self):
       |        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
       |        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
       |        self.config = AutoConfig.from_pretrained(model_name)
       |        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
       |
       |    @overrides
       |    def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]:
       |        # truncate long text so it never exceeds the model's maximum sequence length
       |        encoded_input = self.tokenizer(
       |            tuple_["$attribute"], return_tensors='pt', truncation=True, max_length=512
       |        )
       |        output = self.model(**encoded_input)
       |        scores = softmax(output[0][0].detach().numpy())
       |        ranking = np.argsort(scores)[::-1]
       |        labels = {"positive": "$resultAttributePositive", "neutral": "$resultAttributeNeutral", "negative": "$resultAttributeNegative"}
       |        for i in range(scores.shape[0]):
       |            label = labels[self.config.id2label[ranking[i]]]
       |            score = scores[ranking[i]]
       |            tuple_[label] = np.round(float(score), 4)
       |        yield tuple_""".stripMargin
  }

  /** Display name, description, group and port layout (one input, one output) of the operator. */
  override def operatorInfo: OperatorInfo =
    OperatorInfo(
      "Hugging Face Sentiment Analysis",
      "Analyzing Sentiments with a Twitter-Based Model from Hugging Face",
      OperatorGroupConstants.MACHINE_LEARNING_GROUP,
      inputPorts = List(InputPort()),
      outputPorts = List(OutputPort()),
      supportReconfiguration = true
    )

  /**
    * Derives the output schema: the first input schema followed by the three result
    * columns (positive, neutral, negative), each of type DOUBLE.
    *
    * @param schemas input schemas; only the first one is used
    * @return the extended schema, or `null` while any result column name is unset or blank
    */
  override def getOutputSchema(schemas: Array[Schema]): Schema = {
    val resultNames =
      List(resultAttributePositive, resultAttributeNeutral, resultAttributeNegative)
    // null is kept as the "not configured yet" result so existing callers see no change
    if (resultNames.exists(name => name == null || name.trim.isEmpty)) null
    else
      Schema
        .builder()
        .add(schemas(0))
        .add(resultAttributePositive, AttributeType.DOUBLE)
        .add(resultAttributeNeutral, AttributeType.DOUBLE)
        .add(resultAttributeNegative, AttributeType.DOUBLE)
        .build()
  }
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.