|  | 
|  | 1 | +/* | 
|  | 2 | + * Licensed to the Apache Software Foundation (ASF) under one | 
|  | 3 | + * or more contributor license agreements.  See the NOTICE file | 
|  | 4 | + * distributed with this work for additional information | 
|  | 5 | + * regarding copyright ownership.  The ASF licenses this file | 
|  | 6 | + * to you under the Apache License, Version 2.0 (the | 
|  | 7 | + * "License"); you may not use this file except in compliance | 
|  | 8 | + * with the License.  You may obtain a copy of the License at | 
|  | 9 | + * | 
|  | 10 | + *   http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 11 | + * | 
|  | 12 | + * Unless required by applicable law or agreed to in writing, | 
|  | 13 | + * software distributed under the License is distributed on an | 
|  | 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
|  | 15 | + * KIND, either express or implied.  See the License for the | 
|  | 16 | + * specific language governing permissions and limitations | 
|  | 17 | + * under the License. | 
|  | 18 | + */ | 
|  | 19 | + | 
|  | 20 | +package org.apache.spark.sql.comet | 
|  | 21 | + | 
|  | 22 | +import scala.collection.JavaConverters._ | 
|  | 23 | + | 
|  | 24 | +import org.apache.arrow.memory.RootAllocator | 
|  | 25 | +import org.apache.arrow.vector.complex.MapVector | 
|  | 26 | +import org.apache.arrow.vector.types.{DateUnit, FloatingPointPrecision, IntervalUnit, TimeUnit} | 
|  | 27 | +import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} | 
|  | 28 | +import org.apache.spark.sql.internal.SQLConf | 
|  | 29 | +import org.apache.spark.sql.types._ | 
|  | 30 | + | 
/**
 * Conversion helpers between Spark SQL data types / schemas and their Arrow counterparts.
 */
object CometArrowUtils {

  /** Root Arrow allocator, constructed with `Long.MaxValue` as its limit argument. */
  val rootAllocator: RootAllocator = new RootAllocator(Long.MaxValue)

  // TODO: support more types.

  /**
   * Maps a (non-nested) data type from Spark to Arrow.
   *
   * NOTE: timeZoneId is required for TimestampType; it is ignored for every other type.
   *
   * @param dt
   *   the Spark data type to convert
   * @param timeZoneId
   *   time zone attached to Arrow timestamps produced for `TimestampType`
   * @return
   *   the matching Arrow type
   * @throws IllegalStateException
   *   if `dt` is `TimestampType` and `timeZoneId` is null
   * @throws IllegalArgumentException
   *   if `dt` has no mapping here (nested types are handled by [[toArrowField]])
   */
  def toArrowType(dt: DataType, timeZoneId: String): ArrowType = dt match {
    case BooleanType => ArrowType.Bool.INSTANCE
    case ByteType => new ArrowType.Int(8, true)
    case ShortType => new ArrowType.Int(8 * 2, true)
    case IntegerType => new ArrowType.Int(8 * 4, true)
    case LongType => new ArrowType.Int(8 * 8, true)
    case FloatType => new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
    case DoubleType => new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
    case StringType => ArrowType.Utf8.INSTANCE
    case BinaryType => ArrowType.Binary.INSTANCE
    case DecimalType.Fixed(precision, scale) => new ArrowType.Decimal(precision, scale)
    case DateType => new ArrowType.Date(DateUnit.DAY)
    // The guarded case must precede the plain TimestampType case so the null check wins.
    case TimestampType if timeZoneId == null =>
      throw new IllegalStateException("Missing timezoneId where it is mandatory.")
    case TimestampType => new ArrowType.Timestamp(TimeUnit.MICROSECOND, timeZoneId)
    case TimestampNTZType =>
      // A null Arrow timezone is what fromArrowType maps back to TimestampNTZType.
      new ArrowType.Timestamp(TimeUnit.MICROSECOND, null)
    case NullType => ArrowType.Null.INSTANCE
    case _: YearMonthIntervalType => new ArrowType.Interval(IntervalUnit.YEAR_MONTH)
    case _: DayTimeIntervalType => new ArrowType.Duration(TimeUnit.MICROSECOND)
    case _ =>
      // Name the offending type so the failure is diagnosable from the message alone.
      throw new IllegalArgumentException(s"Unsupported Spark data type: $dt")
  }

  /**
   * Maps a (non-nested) data type from Arrow to Spark.
   *
   * Only signed 8/16/32/64-bit integers, single/double precision floats, day-unit dates,
   * microsecond timestamps, year-month intervals and microsecond durations are recognised,
   * alongside bool, utf8, binary, decimal and null.
   *
   * @param dt
   *   the Arrow type to convert
   * @return
   *   the matching Spark data type
   * @throws IllegalArgumentException
   *   if `dt` has no mapping here (nested types are handled by [[fromArrowField]])
   */
  def fromArrowType(dt: ArrowType): DataType = dt match {
    case ArrowType.Bool.INSTANCE => BooleanType
    case int: ArrowType.Int if int.getIsSigned && int.getBitWidth == 8 => ByteType
    case int: ArrowType.Int if int.getIsSigned && int.getBitWidth == 8 * 2 => ShortType
    case int: ArrowType.Int if int.getIsSigned && int.getBitWidth == 8 * 4 => IntegerType
    case int: ArrowType.Int if int.getIsSigned && int.getBitWidth == 8 * 8 => LongType
    case float: ArrowType.FloatingPoint
        if float.getPrecision() == FloatingPointPrecision.SINGLE =>
      FloatType
    case float: ArrowType.FloatingPoint
        if float.getPrecision() == FloatingPointPrecision.DOUBLE =>
      DoubleType
    case ArrowType.Utf8.INSTANCE => StringType
    case ArrowType.Binary.INSTANCE => BinaryType
    case d: ArrowType.Decimal => DecimalType(d.getPrecision, d.getScale)
    case date: ArrowType.Date if date.getUnit == DateUnit.DAY => DateType
    // Timestamps without a timezone map to NTZ; this case must precede the general one.
    case ts: ArrowType.Timestamp
        if ts.getUnit == TimeUnit.MICROSECOND && ts.getTimezone == null =>
      TimestampNTZType
    case ts: ArrowType.Timestamp if ts.getUnit == TimeUnit.MICROSECOND => TimestampType
    case ArrowType.Null.INSTANCE => NullType
    case yi: ArrowType.Interval if yi.getUnit == IntervalUnit.YEAR_MONTH =>
      YearMonthIntervalType()
    case di: ArrowType.Duration if di.getUnit == TimeUnit.MICROSECOND => DayTimeIntervalType()
    case _ =>
      // Name the offending type so the failure is diagnosable from the message alone.
      throw new IllegalArgumentException(s"Unsupported Arrow type: $dt")
  }

  /**
   * Maps a field from Spark to Arrow, recursing into arrays, structs, maps and UDTs.
   *
   * NOTE: timeZoneId is required when the type (or any nested type) is TimestampType.
   *
   * @param name
   *   name given to the resulting Arrow field
   * @param dt
   *   Spark data type of the field
   * @param nullable
   *   whether the resulting Arrow field is nullable
   * @param timeZoneId
   *   time zone forwarded to [[toArrowType]] for timestamp leaves
   * @return
   *   the Arrow field, with child fields for nested types
   */
  def toArrowField(name: String, dt: DataType, nullable: Boolean, timeZoneId: String): Field = {
    dt match {
      case ArrayType(elementType, containsNull) =>
        // A list has exactly one child, named "element", carrying the element nullability.
        val fieldType = new FieldType(nullable, ArrowType.List.INSTANCE, null)
        new Field(
          name,
          fieldType,
          Seq(toArrowField("element", elementType, containsNull, timeZoneId)).asJava)
      case StructType(fields) =>
        // One Arrow child per Spark struct field, in the same order.
        val fieldType = new FieldType(nullable, ArrowType.Struct.INSTANCE, null)
        new Field(
          name,
          fieldType,
          fields
            .map { field =>
              toArrowField(field.name, field.dataType, field.nullable, timeZoneId)
            }
            .toSeq
            .asJava)
      case MapType(keyType, valueType, valueContainsNull) =>
        val mapType = new FieldType(nullable, new ArrowType.Map(false), null)
        // Note: Map Type struct can not be null, Struct Type key field can not be null
        new Field(
          name,
          mapType,
          Seq(
            toArrowField(
              MapVector.DATA_VECTOR_NAME,
              new StructType()
                .add(MapVector.KEY_NAME, keyType, nullable = false)
                .add(MapVector.VALUE_NAME, valueType, nullable = valueContainsNull),
              nullable = false,
              timeZoneId)).asJava)
      // A UDT is represented by its underlying SQL type.
      case udt: UserDefinedType[_] => toArrowField(name, udt.sqlType, nullable, timeZoneId)
      case dataType =>
        // Leaf type: no children.
        val fieldType = new FieldType(nullable, toArrowType(dataType, timeZoneId), null)
        new Field(name, fieldType, Seq.empty[Field].asJava)
    }
  }

  /**
   * Maps an Arrow field to the Spark data type it represents, recursing into map, list and
   * struct children. Only the data type is returned; the field's own name and nullability
   * are not part of the result.
   *
   * @param field
   *   the Arrow field to convert
   * @return
   *   the matching Spark data type
   */
  def fromArrowField(field: Field): DataType = {
    field.getType match {
      case _: ArrowType.Map =>
        // The map's single child is the entries struct; its children 0 and 1 are key and value.
        val elementField = field.getChildren.get(0)
        val keyType = fromArrowField(elementField.getChildren.get(0))
        val valueType = fromArrowField(elementField.getChildren.get(1))
        MapType(keyType, valueType, elementField.getChildren.get(1).isNullable)
      case ArrowType.List.INSTANCE =>
        val elementField = field.getChildren().get(0)
        val elementType = fromArrowField(elementField)
        ArrayType(elementType, containsNull = elementField.isNullable)
      case ArrowType.Struct.INSTANCE =>
        val fields = field.getChildren().asScala.map { child =>
          val dt = fromArrowField(child)
          StructField(child.getName, dt, child.isNullable)
        }
        StructType(fields.toArray)
      case arrowType => fromArrowType(arrowType)
    }
  }

  /**
   * Maps schema from Spark to Arrow. NOTE: timeZoneId required for TimestampType in StructType
   *
   * @param schema
   *   the Spark schema to convert
   * @param timeZoneId
   *   time zone forwarded to [[toArrowField]] for every top-level field
   * @return
   *   an Arrow schema with one field per Spark field, in the same order
   */
  def toArrowSchema(schema: StructType, timeZoneId: String): Schema = {
    new Schema(schema.map { field =>
      toArrowField(field.name, field.dataType, field.nullable, timeZoneId)
    }.asJava)
  }

  /**
   * Maps schema from Arrow to Spark, keeping each field's name and nullability.
   *
   * @param schema
   *   the Arrow schema to convert
   * @return
   *   a Spark schema with one field per Arrow field, in the same order
   */
  def fromArrowSchema(schema: Schema): StructType = {
    StructType(schema.getFields.asScala.map { field =>
      val dt = fromArrowField(field)
      StructField(field.getName, dt, field.isNullable)
    }.toArray)
  }

  /**
   * Return Map with conf settings to be used in ArrowPythonRunner.
   *
   * The map holds three entries, keyed by the corresponding `SQLConf` keys: the session local
   * time zone, the pandas "assign columns by name" flag and the Arrow safe type conversion
   * flag (the two flags rendered with `toString`).
   */
  def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = {
    val timeZoneConf = Seq(SQLConf.SESSION_LOCAL_TIMEZONE.key -> conf.sessionLocalTimeZone)
    val pandasColsByName = Seq(
      SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME.key ->
        conf.pandasGroupedMapAssignColumnsByName.toString)
    val arrowSafeTypeCheck = Seq(
      SQLConf.PANDAS_ARROW_SAFE_TYPE_CONVERSION.key ->
        conf.arrowSafeTypeConversion.toString)
    Map(timeZoneConf ++ pandasColsByName ++ arrowSafeTypeCheck: _*)
  }

}
0 commit comments