Skip to content

Commit

Permalink
Dazfuller/threshold bytes (#22)
Browse files Browse the repository at this point in the history
* Rename maxBytesForTempFiles to thresholdBytesForTempFiles

* update to include newer spark versions
  • Loading branch information
dazfuller authored Aug 20, 2022
1 parent 21083e7 commit e94d141
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/spark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
build:
strategy:
matrix:
sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1 ]
sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.3.0 ]

runs-on: ubuntu-latest

Expand Down
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,16 @@ val globFileDF = spark.read

The library supports the following options:

Option | Type | Default | Description
-------------------- | ------- | -------- | -----------
cellAddress | String | A1 | Location of the first cell of the table (including header)
headerRowCount | Int | 1 | Number of rows which make up the header. If no header is available then set this value to 0 (zero)
includeSheetName | Boolean | False | Includes the name of the worksheet the data has come from when set to true. Uses the column `_SheetName`
workbookPassword | String | _Empty_ | Password required to open Excel workbook
sheetNamePattern | String | _Empty_ | Regular expression to use to match worksheet names
maxRowCount | Int | 1000 | Number of records to read to infer the schema. If set to 0 (zero) then all available rows will be read
maxBytesForTempFiles | Int | 10000000 | Sets the number of bytes at which a workbook is (ooxml format) is regarded as too large to hold in memory and the data is put into temp files instead. Whilst the cluster may have large volumes of memory, the node processing the file will be limited.
| Option | Type | Default | Description |
|----------------------------|---------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| cellAddress | String | A1 | Location of the first cell of the table (including header) |
| headerRowCount | Int | 1 | Number of rows which make up the header. If no header is available then set this value to 0 (zero) |
| includeSheetName | Boolean | False | Includes the name of the worksheet the data has come from when set to true. Uses the column `_SheetName` |
| workbookPassword | String | _Empty_ | Password required to open Excel workbook |
| sheetNamePattern | String | _Empty_ | Regular expression to use to match worksheet names |
| maxRowCount | Int | 1000 | Number of records to read to infer the schema. If set to 0 (zero) then all available rows will be read |
| maxBytesForTempFiles | Int | 100000000 | Sets the number of bytes at which a workbook (ooxml format) is regarded as too large to hold in memory and the data is put into temp files instead. Whilst the cluster may have large volumes of memory, the node processing the file will be limited. |
| thresholdBytesForTempFiles | Int | 100000000 | _Alias for maxBytesForTempFiles_. If both options are provided, thresholdBytesForTempFiles takes precedence. |

```scala
val df = spark.read
Expand All @@ -105,7 +106,7 @@ val df = spark.read
.option("workbookPassword", "AP@55w0rd") // Use this password to open the workbook with
.option("sheetNamePattern", """Sheet[13]""") // Read data from all sheets matching this pattern (e.g. Sheet1 and Sheet3)
.option("maxRowCount", 10) // Read only the first 10 records to determine the schema of the data
.option("maxBytesForTempFiles", 50000000) // Set size limit before temp files are used
.option("thresholdBytesForTempFiles", 50000000) // Set size limit before temp files are used
.load("/path/to/file.xlsx")
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ private[excel] case class ExcelParserOptions(workbookPassword: Option[String] =
headerRowCount: Int = 1,
maxRowCount: Int = 1000,
includeSheetName: Boolean = false,
maxBytesForTempFiles: Int = 100000000)
thresholdBytesForTempFiles: Int = 100000000)

private[excel] object ExcelParserOptions {
private val encoder = new DoubleMetaphone()
Expand All @@ -54,11 +54,13 @@ private[excel] object ExcelParserOptions {
encoder.encode("maxRowCount") -> "maxRowCount",
encoder.encode("includeSheetName") -> "includeSheetName",
encoder.encode("maxBytesForTempFiles") -> "maxBytesForTempFiles",
encoder.encode("thresholdBytesForTempFiles") -> "thresholdBytesForTempFiles"
)

/**
* Checks the provided set of keys for invalid options and attempts to match them against
* valid options.
*
* @param keys collection of keys to validate
* @return An [[Option]] containing a string if there are errors, or [[None]]
*/
Expand Down Expand Up @@ -93,14 +95,16 @@ private[excel] object ExcelParserOptions {
None
}

val thresholdBytesForTempFiles = options.getInt("thresholdBytesForTempFiles", options.getInt("maxBytesForTempFiles", 100000000))

ExcelParserOptions(
worksheetPassword,
options.getOrDefault("sheetNamePattern", ""),
options.getOrDefault("cellAddress", "A1"),
options.getInt("headerRowCount", 1),
options.getInt("maxRowCount", 1000),
options.getBoolean("includeSheetName", false),
options.getInt("maxBytesForTempFiles", 100000000)
thresholdBytesForTempFiles
)
}

Expand All @@ -122,14 +126,16 @@ private[excel] object ExcelParserOptions {
None
}

val thresholdBytesForTempFiles = options.getOrElse("thresholdBytesForTempFiles", options.getOrElse("maxBytesForTempFiles", "100000000"))

ExcelParserOptions(
worksheetPassword,
options.getOrElse("sheetNamePattern", ""),
options.getOrElse("cellAddress", "A1"),
options.getOrElse("headerRowCount", "1").toInt,
options.getOrElse("maxRowCount", "1000").toInt,
options.getOrElse("includeSheetName", "false").toBoolean,
options.getOrElse("maxBytesForTempFiles", "100000000").toInt
thresholdBytesForTempFiles.toInt
)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserO
}

ZipSecureFile.setMinInflateRatio(0)
ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(options.maxBytesForTempFiles)
ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(options.thresholdBytesForTempFiles)

options.workbookPassword match {
case Some(password) => WorkbookFactory.create(inputStream, password)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
options.headerRowCount should be(1)
options.maxRowCount should be(1000)
options.includeSheetName should be(false)
options.maxBytesForTempFiles should be(100000000)
options.thresholdBytesForTempFiles should be(100000000)
}

"Creating from a case insensitive map" should "use default values for an empty map" in {
Expand All @@ -30,7 +30,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
options.headerRowCount should be(1)
options.maxRowCount should be(1000)
options.includeSheetName should be(false)
options.maxBytesForTempFiles should be(100000000)
options.thresholdBytesForTempFiles should be(100000000)
}

it should "extract values from the map" in {
Expand All @@ -52,7 +52,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
options.headerRowCount should be(12)
options.maxRowCount should be(2000)
options.includeSheetName should be(true)
options.maxBytesForTempFiles should be(10)
options.thresholdBytesForTempFiles should be(10)
}

it should "provide useful error information if options are slightly mis-spelt" in {
Expand Down Expand Up @@ -87,6 +87,27 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
options.workbookPassword should be(None)
}

it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is not provided" in {
  // Only the new option name is supplied; the legacy maxBytesForTempFiles key is absent.
  val rawOptions = Map[String, String]("thresholdBytesForTempFiles" -> "100").asJava
  val parsed = ExcelParserOptions.from(new CaseInsensitiveStringMap(rawOptions))

  parsed.thresholdBytesForTempFiles should be(100)
}

it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is also specified" in {
  // Both the new option and its legacy alias are supplied with different values
  // so that the assertion can tell which one was actually used.
  val input = new CaseInsensitiveStringMap(Map[String, String](
    "thresholdBytesForTempFiles" -> "100",
    "maxBytesForTempFiles" -> "120"
  ).asJava)

  val options = ExcelParserOptions.from(input)

  // The value given for thresholdBytesForTempFiles (100) must win over the alias (120).
  options.thresholdBytesForTempFiles should be(100)
}

"Creating from a string map" should "use default values for an empty map" in {
val input = Map[String, String]()

Expand All @@ -98,7 +119,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
options.headerRowCount should be(1)
options.maxRowCount should be(1000)
options.includeSheetName should be(false)
options.maxBytesForTempFiles should be(100000000)
options.thresholdBytesForTempFiles should be(100000000)
}

it should "extract values from the map" in {
Expand All @@ -120,6 +141,27 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
options.headerRowCount should be(12)
options.maxRowCount should be(2000)
options.includeSheetName should be(true)
options.maxBytesForTempFiles should be(100)
options.thresholdBytesForTempFiles should be(100)
}

it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is not provided" in {
  // Only the new option name is supplied; the legacy maxBytesForTempFiles key is absent.
  val rawOptions: Map[String, String] = Map("thresholdBytesForTempFiles" -> "100")
  val parsed = ExcelParserOptions.from(rawOptions)

  parsed.thresholdBytesForTempFiles should be(100)
}

it should "use thresholdBytesForTempFiles if maxBytesForTempFiles is also specified" in {
  // Both the new option and its legacy alias are supplied with different values
  // so that the assertion can tell which one was actually used.
  val input = Map[String, String](
    "thresholdBytesForTempFiles" -> "100",
    "maxBytesForTempFiles" -> "120"
  )

  val options = ExcelParserOptions.from(input)

  // The value given for thresholdBytesForTempFiles (100) must win over the alias (120).
  options.thresholdBytesForTempFiles should be(100)
}
}

0 comments on commit e94d141

Please sign in to comment.