Dazfuller/threshold bytes (#22)

* Rename maxBytesForTempFiles to thresholdBytesForTempFiles * update to include newer spark versions
elastacloud · Aug 20, 2022 · e94d141 · e94d141
1 parent 21083e7
commit e94d141
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 20 deletions.
diff --git a/.github/workflows/spark.yml b/.github/workflows/spark.yml
@@ -12,7 +12,7 @@ jobs:
   build:
     strategy:
       matrix:
-        sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1 ]
+        sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.3.0 ]
 
     runs-on: ubuntu-latest
 

diff --git a/README.md b/README.md
@@ -86,15 +86,16 @@ val globFileDF = spark.read
 
 The library supports the following options:
 
-Option               | Type    | Default  | Description
--------------------- | ------- | -------- | -----------
-cellAddress          | String  | A1       | Location of the first cell of the table (including header)
-headerRowCount       | Int     | 1        | Number of rows which make up the header. If no header is available then set this value to 0 (zero)
-includeSheetName     | Boolean | False    | Includes the name of the worksheet the data has come from when set to true. Uses the column `_SheetName`
-workbookPassword     | String  | _Empty_  | Password required to open Excel workbook
-sheetNamePattern     | String  | _Empty_  | Regular expression to use to match worksheet names
-maxRowCount          | Int     | 1000     | Number of records to read to infer the schema. If set to 0 (zero) then all available rows will be read
-maxBytesForTempFiles | Int     | 10000000 | Sets the number of bytes at which a workbook is (ooxml format) is regarded as too large to hold in memory and the data is put into temp files instead. Whilst the cluster may have large volumes of memory, the node processing the file will be limited.
+| Option                     | Type    | Default  | Description                                                                                                                                                                                                                                               |
+|----------------------------|---------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| cellAddress                | String  | A1       | Location of the first cell of the table (including header)                                                                                                                                                                                                |
+| headerRowCount             | Int     | 1        | Number of rows which make up the header. If no header is available then set this value to 0 (zero)                                                                                                                                                        |
+| includeSheetName           | Boolean | False    | Includes the name of the worksheet the data has come from when set to true. Uses the column `_SheetName`                                                                                                                                                  |
+| workbookPassword           | String  | _Empty_  | Password required to open Excel workbook                                                                                                                                                                                                                  |
+| sheetNamePattern           | String  | _Empty_  | Regular expression to use to match worksheet names                                                                                                                                                                                                        |
+| maxRowCount                | Int     | 1000     | Number of records to read to infer the schema. If set to 0 (zero) then all available rows will be read                                                                                                                                                    |
+| maxBytesForTempFiles       | Int     | 100000000 | Sets the number of bytes at which a workbook (ooxml format) is regarded as too large to hold in memory and the data is put into temp files instead. Whilst the cluster may have large volumes of memory, the node processing the file will be limited.   |
+| thresholdBytesForTempFiles | Int     | 100000000 | _Alias for maxBytesForTempFiles_                                                                                                                                                                                                                         |
 
 ```scala
 val df = spark.read
@@ -105,7 +106,7 @@ val df = spark.read
   .option("workbookPassword", "AP@55w0rd") // Use this password to open the workbook with
   .option("sheetNamePattern", """Sheet[13]""") // Read data from all sheets matching this pattern (e.g. Sheet1 and Sheet3)
   .option("maxRowCount", 10) // Read only the first 10 records to determine the schema of the data
-  .option("maxBytesForTempFiles", 50000000) // Set size limit before temp files are used
+  .option("thresholdBytesForTempFiles", 50000000) // Set size limit before temp files are used
   .load("/path/to/file.xlsx")
 ```
 

diff --git a/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala b/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala
@@ -38,7 +38,7 @@ private[excel] case class ExcelParserOptions(workbookPassword: Option[String] =
                                              headerRowCount: Int = 1,
                                              maxRowCount: Int = 1000,
                                              includeSheetName: Boolean = false,
-                                             maxBytesForTempFiles: Int = 100000000)
+                                             thresholdBytesForTempFiles: Int = 100000000)
 
 private[excel] object ExcelParserOptions {
   private val encoder = new DoubleMetaphone()
@@ -54,11 +54,13 @@ private[excel] object ExcelParserOptions {
     encoder.encode("maxRowCount") -> "maxRowCount",
     encoder.encode("includeSheetName") -> "includeSheetName",
     encoder.encode("maxBytesForTempFiles") -> "maxBytesForTempFiles",
+    encoder.encode("thresholdBytesForTempFiles") -> "thresholdBytesForTempFiles"
   )
 
   /**
    * Checks the provided set of keys for invalid options and attempts to match against
    * valid options.
+   *
    * @param keys collection of keys to validate
    * @return An [[Option]] containing a string if there are errors, or [[None]]
    */
@@ -93,14 +95,16 @@ private[excel] object ExcelParserOptions {
       None
     }
 
+    val thresholdBytesForTempFiles = options.getInt("thresholdBytesForTempFiles", options.getInt("maxBytesForTempFiles", 100000000))
+
     ExcelParserOptions(
       worksheetPassword,
       options.getOrDefault("sheetNamePattern", ""),
       options.getOrDefault("cellAddress", "A1"),
       options.getInt("headerRowCount", 1),
       options.getInt("maxRowCount", 1000),
       options.getBoolean("includeSheetName", false),
-      options.getInt("maxBytesForTempFiles", 100000000)
+      thresholdBytesForTempFiles
     )
   }
 
@@ -122,14 +126,16 @@ private[excel] object ExcelParserOptions {
       None
     }
 
+    val thresholdBytesForTempFiles = options.getOrElse("thresholdBytesForTempFiles", options.getOrElse("maxBytesForTempFiles", "100000000"))
+
     ExcelParserOptions(
       worksheetPassword,
       options.getOrElse("sheetNamePattern", ""),
       options.getOrElse("cellAddress", "A1"),
       options.getOrElse("headerRowCount", "1").toInt,
       options.getOrElse("maxRowCount", "1000").toInt,
       options.getOrElse("includeSheetName", "false").toBoolean,
-      options.getOrElse("maxBytesForTempFiles", "100000000").toInt
+      thresholdBytesForTempFiles.toInt
     )
   }
 }
diff --git a/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala b/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -55,7 +55,7 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserO
     }
 
     ZipSecureFile.setMinInflateRatio(0)
-    ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(options.maxBytesForTempFiles)
+    ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(options.thresholdBytesForTempFiles)
 
     options.workbookPassword match {
       case Some(password) => WorkbookFactory.create(inputStream, password)

diff --git a/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala b/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala
@@ -16,7 +16,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
-    options.maxBytesForTempFiles should be(100000000)
+    options.thresholdBytesForTempFiles should be(100000000)
   }
 
   "Creating from a case insensitive map" should "use default values for an empty map" in {
@@ -30,7 +30,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
-    options.maxBytesForTempFiles should be(100000000)
+    options.thresholdBytesForTempFiles should be(100000000)
   }
 
   it should "extract values from the map" in {
@@ -52,7 +52,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(12)
     options.maxRowCount should be(2000)
     options.includeSheetName should be(true)
-    options.maxBytesForTempFiles should be(10)
+    options.thresholdBytesForTempFiles should be(10)
   }
 
   it should "provide useful error information if options are slightly mis-spelt" in {
@@ -87,6 +87,27 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.workbookPassword should be(None)
   }
 
+  it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is not provided" in {
+    val input = new CaseInsensitiveStringMap(Map[String, String](
+      "thresholdBytesForTempFiles" -> "100"
+    ).asJava)
+
+    val options = ExcelParserOptions.from(input)
+
+    options.thresholdBytesForTempFiles should be(100)
+  }
+
+  it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is alo specified" in {
+    val input = new CaseInsensitiveStringMap(Map[String, String](
+      "thresholdBytesForTempFiles" -> "100",
+      "maxBytesForTempFiles" -> "120"
+    ).asJava)
+
+    val options = ExcelParserOptions.from(input)
+
+    options.thresholdBytesForTempFiles should be(100)
+  }
+
   "Creating from a string map" should "use default values for an empty map" in {
     val input = Map[String, String]()
 
@@ -98,7 +119,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
-    options.maxBytesForTempFiles should be(100000000)
+    options.thresholdBytesForTempFiles should be(100000000)
   }
 
   it should "extract values from the map" in {
@@ -120,6 +141,27 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(12)
     options.maxRowCount should be(2000)
     options.includeSheetName should be(true)
-    options.maxBytesForTempFiles should be(100)
+    options.thresholdBytesForTempFiles should be(100)
+  }
+
+  it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is not provided" in {
+    val input = Map[String, String](
+      "thresholdBytesForTempFiles" -> "100"
+    )
+
+    val options = ExcelParserOptions.from(input)
+
+    options.thresholdBytesForTempFiles should be(100)
+  }
+
+  it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is alo specified" in {
+    val input = Map[String, String](
+      "thresholdBytesForTempFiles" -> "100",
+      "maxBytesForTempFiles" -> "120"
+    )
+
+    val options = ExcelParserOptions.from(input)
+
+    options.thresholdBytesForTempFiles should be(100)
   }
 }