diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index ce790653..9ab17e5d 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -4641,6 +4641,268 @@ public static Column format_number(Column x, Integer d) { return new Column(com.snowflake.snowpark.functions.format_number(x.toScalaColumn(), d)); } + /** + * This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns + * in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column. + * + *

NOTE: + * + *

Timestamp type: there is no interpretation of date values as UTC Identifiers with spaces: + * Snowflake returns error when an invalid expression is sent. + * + *

Usage: + * + *

{@code
+   * {
+   *   df = session.createDataFrame(Seq(("CR", "{\"id\": 5,
+   *             \"name\": \"Jose\", \"age\": 29}")))
+   *               .toDF(Seq("nationality", "json_string"))
+   * }
+   * When the result of this function is the only part of
+   * the select statement, no changes are needed
+   * df.select(json_tuple(col("json_string"), "id", "name", "age")).show()
+   * ----------------------
+   * |"C0"  |"C1"  |"C2"  |
+   * ----------------------
+   * |5     |Jose  |29    |
+   * ----------------------
+   *
+   * However, when specifying multiple columns, an expression like this is required:
+   *
+   * df.select(
+   *   col("nationality")
+   *   , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax.
+   * ).show()
+   *
+   *
+   *
+   * ---------------------------------------
+   * |"NATIONALITY"  |"C0"  |"C1"  |"C2"  |
+   * ---------------------------------------
+   * |CR             |5     |Jose  |29    |
+   * ---------------------------------------
+   * }
+ * + * @since 1.15.0 + * @param json Column containing the JSON string text. + * @param fields Fields as string to pull from the JSON file. + * @return seqToList[] sequence with the specified strings. + */ + public static List json_tuple(Column json, String... fields) { + int i = -1; + java.util.ArrayList result = + new java.util.ArrayList(); + for (int j = 0; j < fields.length; j++) { + i = i + 1; + result.add(Functions.callUDF("JSON_EXTRACT_PATH_TEXT", json, col(fields[j])).as("c" + i)); + } + return result; + } + /** + * This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns + * in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column. + * + *

NOTE: + * + *

Timestamp type: there is no interpretation of date values as UTC Identifiers with spaces: + * Snowflake returns error when an invalid expression is sent. + * + *

Usage: + * + *

{@code
+   * {
+   *   df = session.createDataFrame(Seq(("CR", "{\"id\": 5,
+   *             \"name\": \"Jose\", \"age\": 29}")))
+   *               .toDF(Seq("nationality", "json_string"))
+   * }
+   * When the result of this function is the only part of
+   * the select statement, no changes are needed
+   * df.select(json_tuple(col("json_string"), "id", "name", "age")).show()
+   * ----------------------
+   * |"C0"  |"C1"  |"C2"  |
+   * ----------------------
+   * |5     |Jose  |29    |
+   * ----------------------
+   *
+   * However, when specifying multiple columns, an expression like this is required:
+   *
+   * df.select(
+   *   col("nationality")
+   *   , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax.
+   * ).show()
+   *
+   *
+   *
+   * ---------------------------------------
+   * |"NATIONALITY"  |"C0"  |"C1"  |"C2"  |
+   * ---------------------------------------
+   * |CR             |5     |Jose  |29    |
+   * ---------------------------------------
+   * }
+ * + * @since 1.15.0 + * @param json Column containing the JSON string text. + * @param fields Fields as column to pull from the JSON file. + * @return seqToList[] sequence with the specified strings. + */ + public static List json_tuple(Column json, Column... fields) { + int i = -1; + java.util.ArrayList result = + new java.util.ArrayList(); + for (int j = 0; j < fields.length; j++) { + i = i + 1; + result.add(Functions.callUDF("JSON_EXTRACT_PATH_TEXT", json, fields[j]).as("c" + i)); + } + return result; + } + + /** + * Used to calculate the cubic root of a number. + * + *

Example + * + *

{@code
+   * SELECT x, cbrt(x) FROM tab;
+   *
+   * --------+-------------+
+   * x    |   cbrt(x)   |
+   * --------+-------------+
+   * 0      | 0           |
+   * 2      | 1.25992105  |
+   * -10    | -2.15443469 |
+   * [NULL] | [NULL]      |
+   * --------+-------------+
+   * }
+ * + * @since 1.15.0 + * @param x Column to calculate the cubic root. + * @return Column object. + */ + public static Column cbrt(Column x) { + return new Column(com.snowflake.snowpark.functions.cbrt(x.toScalaColumn())); + } + + /** + * Used to calculate the cubic root of a number. There were slight differences found: + * + *

Example + * + *

{@code
+   * SELECT x, cbrt(x) FROM tab;
+   *
+   * --------+-------------+
+   * x    |   cbrt(x)   |
+   * --------+-------------+
+   * 0      | 0           |
+   * 2      | 1.25992105  |
+   * -10    | -2.15443469 |
+   * [NULL] | [NULL]      |
+   * --------+-------------+
+   * }
+ * + * @since 1.15.0 + * @param columnName as a stringto calculate the cubic root. + * @return Column object. + */ + public static Column cbrt(String columnName) { + return new Column(functions.cbrt(columnName)); + } + + /** + * This function converts a JSON string to a variant in Snowflake. + * + *

In Snowflake the values are converted automatically, however they're converted as variants, + * meaning that the printSchema function would return different datatypes. To convert the datatype + * and it to be printed as the expected datatype, it should be read on the + * + *

Example + * + *

{@code
+   * selectExpr function as "json['relative']['age']::integer"
+   * val data_for_json = Seq(
+   *   (1, "{\"id\": 172319, \"age\": 41, \"relative\": {\"id\": 885471, \"age\": 29}}")
+   *   (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}")
+   * )
+   * val data_for_json_column = Seq("col1", "col2")
+   * val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column)
+   *
+   * val json_df = df_for_json.select(
+   *   from_json(col("col2")).as("json")
+   * )
+   *
+   * json_df.selectExpr(
+   *   "json['id']::integer as id"
+   *   , "json['age']::integer as age"
+   *   , "json['relative']['id']::integer as rel_id"
+   *   , "json['relative']['age']::integer as rel_age"
+   * ).show(10, 10000)
+   * -----------------------------------------
+   * |"ID"    |"AGE"  |"REL_ID"  |"REL_AGE"  |
+   * -----------------------------------------
+   * |172319  |41     |885471    |29         |
+   * |532161  |17     |873513    |47         |
+   * -----------------------------------------
+   * }
+ * + * @since 1.15.0 + * @param e String column to convert to variant. + * @return Column object. + */ + public static Column from_json(Column e) { + return new Column(functions.from_json(e.toScalaColumn())); + } + /** + * Returns the value of sourceExpr cast to data type targetType if possible, or NULL if not + * possible. + * + *

Example:: + * + *

{@code
+   * df = session.create_dataframe(['0.12', 'USD 27.90',
+   * '13.97 USD', '97.0', '17,-'], schema=["a"])
+   * df.select(try_cast(col("a"), FloatType()).as_('ans')).collect()
+   * [Row(ANS=0.12), Row(ANS=None), Row(ANS=None), Row(ANS=None), Row(ANS=None)]
+   *
+   * }
+ * + * @since 1.15.0 + * @param e Any castable expression + * @param targetType The type of the result + * @return The result is of type targetType. special version of CAST for a subset of datatype + * conversions. It performs the same operation (i.e. converts a value of one data type into + * another data type), but returns a NULL value instead of raising an error when the + * conversion can not be performed. The column argument must be a string column in Snowflake. + */ + public static Column try_cast(Column e, DataType targetType) { + return e.cast(targetType); + } + /** + * This function receives a date or timestamp, as well as a properly formatted string and + * subtracts the specified amount of days from it. If receiving a string, this string is casted to + * date using try_cast and if it's not possible to cast, returns null. If receiving a timestamp it + * will be casted to date (removing its time). Example:: + * + *
{@code
+   * from snowflake.snowpark.functions import date_sub, to_date
+   *  df = session.createDataFrame([("1976-01-06")], ["date"])
+   *  df = df.withColumn("date",to_date("date"))
+   *  df.withColumn("date", date_sub("date", 2)).show()
+   * --------------
+   * |"DATE"      |
+   * --------------
+   * |1976-01-04  |
+   * --------------
+   *
+   * }
+ * + * @since 1.15.0 + * @param start Date, Timestamp or String column to subtract days from. + * @param days Days to subtract. + * @return Column object. + */ + public static Column date_sub(Column start, Integer days) { + return new Column(functions.date_sub(start.toScalaColumn(), days)); + } /* Returns a Column expression with values sorted in descending order. * *

Example: order column values in descending diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 264b9ffe..81db9a50 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3887,6 +3887,256 @@ object functions { builtin("TO_VARCHAR")(x, if (d > 0) s"999,999.${"0" * d}" else "999,999") } } + + /** + * This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns + * in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column. + * + * NOTE: + *

    + *
  • Timestamp type: there is no interpretation of date values as UTC
  • + *
  • Identifiers with spaces: Snowflake returns error when an invalid expression is sent.
  • + * + * Usage: + * { + * df = session.createDataFrame(Seq(("CR", "{\"id\": 5, + * \"name\": \"Jose\", \"age\": 29}"))) + * .toDF(Seq("nationality", "json_string")) + * } + * When the result of this function is the only part of + * the select statement, no changes are needed + * df.select(json_tuple(col("json_string"), "id", "name", "age")).show() + * + *
    +   * ----------------------
    +   * |"C0"  |"C1"  |"C2"  |
    +   * ----------------------
    +   * |5     |Jose  |29    |
    +   * ----------------------
    +   * 
    + * However, when specifying multiple columns, an expression like this is required: + *
    +   * df.select(
    +   *   col("nationality")
    +   *   , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax.
    +   * ).show()
    +   * 
    + * + *
    +   * ---------------------------------------
    +   * |"NATIONALITY"  |"C0"  |"C1"  |"C2"  |
    +   * ---------------------------------------
    +   * |CR             |5     |Jose  |29    |
    +   * ---------------------------------------
    +   * 
    + * @since 1.15.0 + * @param json Column containing the JSON string text. + * @param fields Fields to pull from the JSON file. + * @return Column sequence with the specified strings. + */ + def json_tuple(json: Column, fields: Column*): Seq[Column] = { + var i = -1 + fields.map(f => { + i += 1 + builtin("JSON_EXTRACT_PATH_TEXT")(json, f).as(s"c$i") + }) + } + + /** + * This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns + * in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column. + * + * NOTE: + *
      + *
    • Timestamp type: there is no interpretation of date values as UTC
    • + *
    • Identifiers with spaces: Snowflake returns error when an invalid expression is sent.
    • + * + * Usage: + * { + * df = session.createDataFrame(Seq(("CR", "{\"id\": 5, + * \"name\": \"Jose\", \"age\": 29}"))) + * .toDF(Seq("nationality", "json_string")) + * } + * When the result of this function is the only part of + * the select statement, no changes are needed + * df.select(json_tuple(col("json_string"), "id", "name", "age")).show() + * + *
      +   * ----------------------
      +   * |"C0"  |"C1"  |"C2"  |
      +   * ----------------------
      +   * |5     |Jose  |29    |
      +   * ----------------------
      +   * 
      + * However, when specifying multiple columns, an expression like this is required: + *
      +   * df.select(
      +   *   col("nationality")
      +   *   , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax.
      +   * ).show()
      +   * 
      + * + *
      +   * ---------------------------------------
      +   * |"NATIONALITY"  |"C0"  |"C1"  |"C2"  |
      +   * ---------------------------------------
      +   * |CR             |5     |Jose  |29    |
      +   * ---------------------------------------
      +   * 
      + * @since 1.15.0 + * @param json Column containing the JSON string text. + * @param fields Fields to pull from the JSON file. + * @return Column sequence with the specified strings. + */ + def json_tuple(json: String, fields: String*): Seq[Column] = { + var i = -1 + fields.map(f => { + i += 1 + builtin("JSON_EXTRACT_PATH_TEXT")(Column(json), Column(f)).as(s"c$i") + }) + } + + /** + * Used to calculate the cubic root of a number. + * Example + * SELECT x, cbrt(x) FROM tab; + * + * --------+-------------+ + * x | cbrt(x) | + * --------+-------------+ + * 0 | 0 | + * 2 | 1.25992105 | + * -10 | -2.15443469 | + * [NULL] | [NULL] | + * --------+-------------+ + * + * @since 1.15.0 + * @param column Column to calculate the cubic root. + * @return Column object. + */ + def cbrt(e: Column): Column = { + builtin("CBRT")(e) + } + + /** + * Used to calculate the cubic root of a number. There were slight differences found: + * Example + * SELECT x, cbrt(x) FROM tab; + * + * --------+-------------+ + * x | cbrt(x) | + * --------+-------------+ + * 0 | 0 | + * 2 | 1.25992105 | + * -10 | -2.15443469 | + * [NULL] | [NULL] | + * --------+-------------+ + * + * @since 1.15.0 + * @param column Column to calculate the cubic root. + * @return Column object. + */ + def cbrt(columnName: String): Column = { + cbrt(col(columnName)) + } + + /** + * This function converts a JSON string to a variant in Snowflake. + * + * In Snowflake the values are converted automatically, however they're converted as variants, + * meaning that the printSchema + * function would return different datatypes. 
+ * To convert the datatype and it to be printed as the expected datatype, + * it should be read on the + * selectExpr function as "json['relative']['age']::integer" + * val data_for_json = Seq( + * (1, "{\"id\": 172319, \"age\": 41, \"relative\": {\"id\": 885471, \"age\": 29}}") + * (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}") + * ) + * val data_for_json_column = Seq("col1", "col2") + * val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column) + * + * val json_df = df_for_json.select( + * from_json(col("col2")).as("json") + * ) + * + * json_df.selectExpr( + * "json['id']::integer as id" + * , "json['age']::integer as age" + * , "json['relative']['id']::integer as rel_id" + * , "json['relative']['age']::integer as rel_age" + * ).show(10, 10000) + * + * + *
      +   * -----------------------------------------
      +   * |"ID"    |"AGE"  |"REL_ID"  |"REL_AGE"  |
      +   * -----------------------------------------
      +   * |172319  |41     |885471    |29         |
      +   * |532161  |17     |873513    |47         |
      +   * -----------------------------------------
      +   * 
      + * @since 1.15.0 + * @param e String column to convert to variant. + * @return Column object. + */ + def from_json(e: Column): Column = { + builtin("TRY_PARSE_JSON")(e) + } + + /** + * Returns the value of sourceExpr cast to data type + * targetType if possible, or NULL if not possible. + * Example:: + * + * df = session.create_dataframe(['0.12', 'USD 27.90', + * '13.97 USD', '97.0', '17,-'], schema=["a"]) + * df.select(try_cast(col("a"), FloatType()).as_('ans')).collect() + * [Row(ANS=0.12), Row(ANS=None), Row(ANS=None), Row(ANS=None), Row(ANS=None)] + * @since 1.15.0 + * @param source Any castable expression + * @param Target The type of the result + * @return The result is of type targetType. + * special version of CAST for a subset of datatype conversions. + * It performs the same operation + * (i.e. converts a value of one data type into another data type), + * but returns a NULL value instead of raising an error + * when the conversion can not be performed. + * The column argument must be a string column in Snowflake. + */ + def try_cast(e: Column, targetType: DataType): Column = { + e.cast(targetType) + } + + /** + * This function receives a date or timestamp, as well as a + * properly formatted string and subtracts the specified + * amount of days from it. If receiving a string, this string is + * casted to date using try_cast and if it's not possible to cast, + * returns null. If receiving + * a timestamp it will be casted to date (removing its time). + * Example:: + * + * >>> from snowflake.snowpark.functions import date_sub, to_date + * >>> df = session.createDataFrame([("1976-01-06")], ["date"]) + * >>> df = df.withColumn("date", to_date("date")) + * >>> df.withColumn("date", date_sub("date", 2)).show() + * -------------- + * |"DATE" | + * -------------- + * |1976-01-04 | + * -------------- + * """ + * + * @since 1.15.0 + * @param start Date, Timestamp or String column to subtract days from. + * @param days Days to subtract. 
+ * @return Column object. + */ + def date_sub(start: Column, days: Int): Column = { + dateadd("DAY", lit(days * -1), try_cast(start, DateType)) + } + /* Returns a Column expression with values sorted in descending order. * Example: * {{{ diff --git a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java index 89e9cbb5..5de58b3d 100644 --- a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java +++ b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java @@ -1,5 +1,7 @@ package com.snowflake.snowpark_test; +import static com.snowflake.snowpark_java.types.DataTypes.DateType; + import com.snowflake.snowpark_java.*; import java.sql.Date; import java.sql.Time; @@ -3186,4 +3188,81 @@ public void to_utc_timestamp() { Row[] expected = {Row.create(Timestamp.valueOf("2024-04-05 01:02:03.0"))}; checkAnswer(df.select(Functions.to_utc_timestamp(df.col("a"))), expected, false); } + + @Test + public void json_tuple1() { + DataFrame df = + getSession() + .sql( + "select parse_json(column1) as v, column2 as k from values ('{\"a\": null}','a'), " + + "('{\"a\": \"foo\"}','a'), ('{\"a\": \"foo\"}','b'), (null,'a')"); + df.show(); + DataFrame jsontupleDF = + df.select( + (Functions.json_tuple(Functions.col("v"), Functions.col("k")).toArray(new Column[0]))); + Row[] expected = { + Row.create((Object) null), + Row.create("foo"), + Row.create((Object) null), + Row.create((Object) null) + }; + checkAnswer(jsontupleDF, expected, false); + } + + @Test + public void json_tuple2() { + DataFrame df = + getSession() + .sql( + "select parse_json(column1) as v,column2 as id,column3 as name" + + " from values ( '{\"id\": 5,\"name\": \"Jose\", \"age\": 29}','id','name','age')"); + + DataFrame jsontupleDF = + df.select( + (Functions.json_tuple(Functions.col("v"), Functions.col("id"), Functions.col("name")) + .toArray(new Column[0]))); + jsontupleDF.show(); + Row[] expected = {Row.create(("5"), 
("Jose"))}; + checkAnswer(jsontupleDF, expected, false); + } + + @Test + public void try_cast() { + DataFrame df = getSession().sql("select * from values('2024-04-05') as t(a)"); + Row[] expected = {Row.create(Date.valueOf("2024-04-05"))}; + checkAnswer(df.select(Functions.try_cast(df.col("a"), DateType)), expected, false); + } + + @Test + public void date_sub() { + DataFrame df = + getSession() + .sql( + "select * from values('2020-05-01 13:11:20.000' :: timestamp)," + + "('2020-08-21 01:30:05.000' :: timestamp) as T(a)"); + Row[] expected = { + Row.create(Date.valueOf("2020-04-30")), Row.create(Date.valueOf("2020-08-20")) + }; + checkAnswer(df.select(Functions.date_sub(df.col("a"), 1)), expected, false); + } + + @Test + public void from_json() { + DataFrame df = + getSession() + .sql( + "select parse_json(column1) as v,column2 as id,column3 as name" + + " from values ( '{\"id\": 5,\"name\": \"Jose\", \"age\": 29}','id','name','age')"); + + DataFrame jsonDF = df.select((Functions.from_json(Functions.col("v")))); + jsonDF.show(); + } + + @Test + public void cbrt() { + DataFrame df = getSession().sql("select column1 from values ( '5'),('1')"); + + Row[] expected = {Row.create((1.7099759466766968)), Row.create((1.0))}; + checkAnswer(df.select(Functions.cbrt(df.col("column1"))), expected, false); + } } diff --git a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala index 8af28666..c350ca9d 100644 --- a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala +++ b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala @@ -2519,6 +2519,53 @@ trait FunctionSuite extends TestData { checkAnswer(data.select(to_utc_timestamp(col("a"))), expected, sort = false) } + test("cbrt") { + checkAnswer( + testData1.select(cbrt(col("NUM"))), + Seq(Row(1.0), Row(1.25992104989)), + sort = false) + } + test("from_json") { + val data_for_json = Seq( + (1, "{\"id\": 172319, \"age\": 41, \"relative\": 
{\"id\": 885471, \"age\": 29}}"), + (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}")) + val data_for_json_column = Seq("col1", "col2") + val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column) + val json_df = df_for_json.select(from_json(col("col2")).as("json")).show() + + } + + test("json_tuple") { + val json_tuple = builtin("JSON_EXTRACT_PATH_TEXT") + checkAnswer( + validJson1.select(json_tuple(col("v"), col("K"))), + Seq(Row(null), Row("foo"), Row(null), Row(null)), + sort = false) + } + + test("json_tuple1") { + val df = session.sql( + "select parse_json(column1) as v,column2 as id,column3 as name" + + " from values ( '{\"id\": 5,\"name\": \"Jose\", \"age\": 29}','id','name','age')") + df.show() + + checkAnswer( + df.select(json_tuple(col("v"), col("id"), col("name"))), + Seq(Row(("5"), ("Jose"))), + sort = false) + } + test("try_cast") { + val df = Seq("1", "2").toDF("a") + checkAnswer(df.select(try_cast(col("a"), IntegerType)), Seq(1, 2), sort = false) + } + test("date_sub") { + var expected = Seq(Date.valueOf("2020-04-30"), Date.valueOf("2020-08-20")).toDF("b") + checkAnswer( + timestamp1 + .select(date_sub(col("a"), 1)), + expected, + sort = false) + } } class EagerFunctionSuite extends FunctionSuite with EagerSession