diff --git a/.github/workflows/spark.yml b/.github/workflows/spark.yml
index 0734d29..4a38cf6 100644
--- a/.github/workflows/spark.yml
+++ b/.github/workflows/spark.yml
@@ -12,7 +12,7 @@ jobs:
   build:
     strategy:
       matrix:
-        sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.2, 3.2.1, 3.2.4, 3.3.0, 3.3.1, 3.3.2, 3.4.0 ]
+        sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.2, 3.2.1, 3.2.4, 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.4.0, 3.4.1, 3.5.0 ]
 
     runs-on: ubuntu-latest
 
diff --git a/build.ps1 b/build.ps1
index 53a6ba1..621a8e4 100644
--- a/build.ps1
+++ b/build.ps1
@@ -14,7 +14,7 @@ https://www.elastacloud.com
 
 #>
 
-$versions = @("3.0.1", "3.0.2", "3.1.2", "3.2.1", "3.2.4", "3.3.0", "3.3.1", "3.3.2", "3.4.0")
+$versions = @("3.0.1", "3.0.2", "3.1.2", "3.2.1", "3.2.4", "3.3.0", "3.3.1", "3.3.2", "3.3.3", "3.4.0", "3.4.1", "3.5.0")
 $jarPath = "./target/jars"
 $covPath = "./target/coverage"
 
diff --git a/build.sbt b/build.sbt
index 9519074..8847057 100644
--- a/build.sbt
+++ b/build.sbt
@@ -23,6 +23,7 @@ val sparkVersion = settingKey[String]("Spark version")
 val sparkExcelVersion = settingKey[String]("Version of the Spark Excel library")
 val scalaTestVersion = settingKey[String]("ScalaTest version")
 val poiVersion = settingKey[String]("Apache POI version")
+val log4JVersion = settingKey[String]("Apache Log4J version")
 
 name := "spark-excel"
 organization := "com.elastacloud"
@@ -61,7 +62,9 @@ libraryDependencies ++= Seq(
   "org.apache.poi" % "poi-ooxml-lite" % poiVersion.value % Compile,
   "org.apache.commons" % "commons-compress" % "1.21" % Compile,
   "org.apache.commons" % "commons-collections4" % "4.4" % Compile,
-  "commons-io" % "commons-io" % "2.11.0" % Compile
+  "commons-io" % "commons-io" % "2.11.0" % Compile,
+  "org.apache.logging.log4j" % "log4j-core" % log4JVersion.value % Compile,
+  "org.apache.logging.log4j" % "log4j-api" % log4JVersion.value % Compile
 )
 
 // Setup test dependencies and configuration
@@ -117,8 +120,8 @@ addArtifact(Compile / assembly / artifact, assembly)
 
 // Define common settings for the library
 val commonSettings = Seq(
-  sparkVersion := System.getProperty("sparkVersion", "3.4.0"),
-  sparkExcelVersion := "0.1.11",
+  sparkVersion := System.getProperty("sparkVersion", "3.5.0"),
+  sparkExcelVersion := "0.1.12",
   version := s"${sparkVersion.value}_${sparkExcelVersion.value}",
   scalaVersion := {
     if (sparkVersion.value < "3.2.0") {
@@ -130,6 +133,7 @@ val commonSettings = Seq(
     }
   },
   scalaTestVersion := "3.2.16",
-  poiVersion := "5.2.2",
+  poiVersion := "5.2.3",
+  log4JVersion := "2.20.0",
   crossVersion := CrossVersion.disabled
 )
diff --git a/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala b/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala
index f52c6f4..55564e3 100644
--- a/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala
+++ b/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala
@@ -60,6 +60,8 @@ private[excel] class ExcelParserOptions(
   val headerRowCount: Int = parameters.getOrElse("headerRowCount", "1").toInt
   val maxRowCount: Int = parameters.getOrElse("maxRowCount", "1000").toInt
   val includeSheetName: Boolean = parameters.getOrElse("includeSheetName", "false").toBoolean
+  // Optional marker text: string cells equal to it (ignoring case) are read as null. None when the option is not set.
+  val nullValue: Option[String] = parameters.get("nullValue")
   val thresholdBytesForTempFiles: Int = parameters.getOrElse("thresholdBytesForTempFiles", parameters.getOrElse("maxBytesForTempFiles", "100000000")).toInt
   val schemaMatchColumnName: String = parameters.getOrElse("schemaMatchColumnName", null)
 
@@ -83,6 +85,7 @@ private[excel] object ExcelParserOptions {
     encoder.encode("headerRowCount") -> "headerRowCount",
     encoder.encode("maxRowCount") -> "maxRowCount",
     encoder.encode("includeSheetName") -> "includeSheetName",
+    encoder.encode("nullValue") -> "nullValue",
     encoder.encode("maxBytesForTempFiles") -> "maxBytesForTempFiles",
     encoder.encode("thresholdBytesForTempFiles") -> "thresholdBytesForTempFiles",
     encoder.encode("schemaMatchColumnName") -> "schemaMatchColumnName"
diff --git a/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala b/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
index 04a45ab..e7ab677 100644
--- a/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
+++ b/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -292,7 +292,12 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserO
             case _ => (null, false)
           }
           case CellType.STRING => targetType match {
-            case _: StringType => (UTF8String.fromString(currentCellValue.getStringValue), true)
+            case _: StringType =>
+              // A string cell matching the configured nullValue marker (ignoring case) is read as null.
+              // The option value is the receiver of the comparison so a null cell string cannot raise an NPE.
+              val rawValue = currentCellValue.getStringValue
+              if (options.nullValue.exists(_.equalsIgnoreCase(rawValue))) (null, true)
+              else (UTF8String.fromString(rawValue), true)
             case _ => (null, false)
           }
           case _ => (UTF8String.fromString(currentCellValue.toString), true)
diff --git a/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala b/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala
index 93b7b48..11f49ae 100644
--- a/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala
+++ b/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala
@@ -1,3 +1,19 @@
+/*
+ * Copyright 2021 Elastacloud Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package com.elastacloud.spark.excel
 
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -16,6 +32,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
+    options.nullValue should be(None)
     options.thresholdBytesForTempFiles should be(100000000)
     options.schemaMatchColumnName should be(null)
   }
@@ -31,6 +48,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
+    options.nullValue should be(None)
     options.thresholdBytesForTempFiles should be(100000000)
     options.schemaMatchColumnName should be(null)
   }
@@ -43,6 +61,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerRowCount" -> "12",
       "maxRowCount" -> "2000",
       "includeSheetName" -> "true",
+      "nullValue" -> "NA",
       "maxBytesForTempFiles" -> "10",
       "schemaMatchColumnName" -> "_isValid"
     ).asJava)
@@ -55,6 +74,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(12)
     options.maxRowCount should be(2000)
     options.includeSheetName should be(true)
+    options.nullValue should be(Some("NA"))
     options.thresholdBytesForTempFiles should be(10)
     options.schemaMatchColumnName should be("_isValid")
   }
@@ -67,6 +87,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerCount" -> "12",
       "maxRowCont" -> "2000",
       "includShetNam" -> "true",
+      "nulvalue" -> "NA",
       "macsBitesTempFiles" -> "10",
       "schemaMatchColumName" -> "_isValid"
     ).asJava)
@@ -79,6 +100,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     exception.getMessage.contains("Invalid option 'headercount', did you mean 'headerRowCount'?") should be(true)
     exception.getMessage.contains("Invalid option 'maxrowcont', did you mean 'maxRowCount'?") should be(true)
     exception.getMessage.contains("Invalid option 'includshetnam', did you mean 'includeSheetName'?") should be(true)
+    exception.getMessage.contains("Invalid option 'nulvalue', did you mean 'nullValue'?") should be(true)
     exception.getMessage.contains("Invalid option 'macsbitestempfiles', did you mean 'maxBytesForTempFiles'") should be(true)
     exception.getMessage.contains("Invalid option 'schemamatchcolumname', did you mean 'schemaMatchColumnName'") should be(true)
   }
@@ -125,6 +147,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
+    options.nullValue should be(None)
     options.thresholdBytesForTempFiles should be(100000000)
     options.schemaMatchColumnName should be(null)
   }
@@ -137,6 +160,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerRowCount" -> "12",
       "maxRowCount" -> "2000",
       "includeSheetName" -> "true",
+      "nullValue" -> "NA",
       "maxBytesForTempFiles" -> "100",
       "schemaMatchColumnName" -> "_isValid"
     )
@@ -149,6 +173,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(12)
     options.maxRowCount should be(2000)
     options.includeSheetName should be(true)
+    options.nullValue should be(Some("NA"))
     options.thresholdBytesForTempFiles should be(100)
     options.schemaMatchColumnName should be("_isValid")
   }
@@ -191,6 +216,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerRowCount" -> "17",
       "maxRowCount" -> "5",
       "includeSheetName" -> "true",
+      "nullValue" -> "N/A",
       "thresholdBytesForTempFiles" -> "12",
       "schemaMatchColumnName" -> "matchesSchema"
     )
@@ -203,6 +229,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(17)
     options.maxRowCount should be(5)
     options.includeSheetName should be(true)
+    options.nullValue should be(Some("N/A"))
     options.thresholdBytesForTempFiles should be(12)
     options.schemaMatchColumnName should be("matchesSchema")
   }
diff --git a/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala b/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
index 31442d9..b5f9606 100644
--- a/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
+++ b/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
@@ -591,4 +591,42 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
       the[ExcelParserException] thrownBy parser.getDataIterator.toList should have message "The specified schema match column is not defined as a boolean type."
     }
   }
+
+  "Specifying a null value" should "read the string value as null" in {
+    withInputStream("/Parser/SimpleWorkbook.xlsx") { inputStream =>
+      val options = new ExcelParserOptions(Map[String, String](
+        "nullValue" -> "y"
+      ))
+
+      val expectedData = Seq(
+        Vector[Any]("a".asUnsafe, 1D, "x".asUnsafe),
+        Vector[Any]("b".asUnsafe, 2D, null),
+        Vector[Any]("c".asUnsafe, 3D, "z".asUnsafe)
+      )
+
+      val parser = new ExcelParser(inputStream, options)
+      val actualData = parser.getDataIterator.toList
+
+      actualData should equal(expectedData)
+    }
+  }
+
+  it should "Handle string concatenation formulas" in {
+    withInputStream("/Parser/ConcatString.xlsx") { inputStream =>
+      val options = new ExcelParserOptions(Map[String, String](
+        "nullValue" -> "MR ADAM FOX"
+      ))
+
+      val expectedData = Seq(
+        Vector[Any]("Dr".asUnsafe, "Jennifer".asUnsafe, "Alagora".asUnsafe, "Dr Jennifer Alagora".asUnsafe),
+        Vector[Any]("Mr".asUnsafe, "Adam".asUnsafe, "Fox".asUnsafe, null),
+        Vector[Any]("Ms".asUnsafe, null, "Proctor".asUnsafe, "Ms Proctor".asUnsafe)
+      )
+
+      val parser = new ExcelParser(inputStream, options)
+
+      val actualData = parser.getDataIterator.toList
+      actualData should equal(expectedData)
+    }
+  }
 }