From ac34f58c0c4e8f94d94a195529f2ff502b1004a4 Mon Sep 17 00:00:00 2001 From: oleibman <10341515+oleibman@users.noreply.github.com> Date: Thu, 6 Feb 2025 17:37:02 -0800 Subject: [PATCH] Writer/Xls/Parser::advance Should Parse by Character It currently parsed by byte, which is not a good thing in a UTF-8 system. See the discussion at the bottom of PR #4203. That turns out to not be the change which led to this problem; that would have been PR #4323. That change came about because using Composer/Pcre revealed bugs in several regular expressions used in Writer/Xls/Parser. This ticket comes about because more bugs were revealed in the same module. The problem is that the `advance` method needs to process formulas character by character, but is instead doing it byte by byte. It is changed to advance by characters, and tests for non-ASCII characters are added. --- CHANGELOG.md | 1 + src/PhpSpreadsheet/Writer/Xls/Parser.php | 21 +++--- .../Writer/Xls/NonLatinFormulasTest.php | 65 +++++++++++++++++++ 3 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 tests/PhpSpreadsheetTests/Writer/Xls/NonLatinFormulasTest.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 959648a020..6599bbbea9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ and this project adheres to [Semantic Versioning](https://semver.org). ### Fixed - Xls writer Parser Mishandling True/False Argument. [Issue #4331](https://github.com/PHPOffice/PhpSpreadsheet/issues/4331) [PR #4333](https://github.com/PHPOffice/PhpSpreadsheet/pull/4333) +- Xls writer Parser Parse By Character Not Byte. [PR #4344](https://github.com/PHPOffice/PhpSpreadsheet/pull/4344) ## 2025-01-26 - 3.9.0 diff --git a/src/PhpSpreadsheet/Writer/Xls/Parser.php b/src/PhpSpreadsheet/Writer/Xls/Parser.php index ec1114a4a3..5324a7bf51 100644 --- a/src/PhpSpreadsheet/Writer/Xls/Parser.php +++ b/src/PhpSpreadsheet/Writer/Xls/Parser.php @@ -67,6 +67,8 @@ class Parser . '[$]?[A-Ia-i]?[A-Za-z][$]?(\\d+)' . '$~u'; + private const UTF8 = 'UTF-8'; + /** * The index of the character we are currently looking at. */ @@ -991,29 +993,30 @@ private function advance(): void { $token = ''; $i = $this->currentCharacter; - $formula_length = strlen($this->formula); + $formula = mb_str_split($this->formula, 1, self::UTF8); + $formula_length = count($formula); // eat up white spaces if ($i < $formula_length) { - while ($this->formula[$i] == ' ') { + while ($formula[$i] === ' ') { ++$i; } if ($i < ($formula_length - 1)) { - $this->lookAhead = $this->formula[$i + 1]; + $this->lookAhead = $formula[$i + 1]; } $token = ''; } while ($i < $formula_length) { - $token .= $this->formula[$i]; + $token .= $formula[$i]; if ($i < ($formula_length - 1)) { - $this->lookAhead = $this->formula[$i + 1]; + $this->lookAhead = $formula[$i + 1]; } else { $this->lookAhead = ''; } - if ($this->match($token) != '') { + if ($this->match($token) !== '') { $this->currentCharacter = $i + 1; $this->currentToken = $token; @@ -1021,7 +1024,7 @@ private function advance(): void } if ($i < ($formula_length - 2)) { - $this->lookAhead = $this->formula[$i + 2]; + $this->lookAhead = $formula[$i + 2]; } else { // if we run out of characters lookAhead becomes empty $this->lookAhead = ''; } @@ -1198,8 +1201,8 @@ private function match(string $token): string public function parse(string $formula): bool { $this->currentCharacter = 0; - $this->formula = (string) $formula; - $this->lookAhead = $formula[1] ?? ''; + $this->formula = $formula; + $this->lookAhead = mb_substr($formula, 1, 1, self::UTF8); $this->advance(); $this->parseTree = $this->condition(); diff --git a/tests/PhpSpreadsheetTests/Writer/Xls/NonLatinFormulasTest.php b/tests/PhpSpreadsheetTests/Writer/Xls/NonLatinFormulasTest.php new file mode 100644 index 0000000000..d8ec963181 --- /dev/null +++ b/tests/PhpSpreadsheetTests/Writer/Xls/NonLatinFormulasTest.php @@ -0,0 +1,65 @@ +getActiveSheet(); + + $validation = $worksheet->getCell('B1')->getDataValidation(); + $validation->setType(DataValidation::TYPE_LIST); + $validation->setErrorStyle(DataValidation::STYLE_STOP); + $validation->setAllowBlank(false); + $validation->setShowInputMessage(true); + $validation->setShowErrorMessage(true); + $validation->setShowDropDown(true); + $validation->setFormula1('"слово, сло"'); + + $dataValidator = new DataValidator(); + $worksheet->getCell('B1')->setValue('слово'); + self::assertTrue( + $dataValidator->isValid($worksheet->getCell('B1')) + ); + $worksheet->getCell('B1')->setValue('слов'); + self::assertFalse( + $dataValidator->isValid($worksheet->getCell('B1')) + ); + + $worksheet->setTitle('словслов'); + $worksheet->getCell('A1')->setValue('=словслов!B1'); + $worksheet->getCell('A2')->setValue("='словслов'!B1"); + $spreadsheet->addNamedRange(new NamedRange('слсл', $worksheet, '$B$1')); + $worksheet->getCell('A3')->setValue('=слсл'); + + $robj = $this->writeAndReload($spreadsheet, 'Xls'); + $spreadsheet->disconnectWorksheets(); + $sheet0 = $robj->getActiveSheet(); + self::assertSame('словслов', $sheet0->getTitle()); + self::assertSame('=словслов!B1', $sheet0->getCell('A1')->getValue()); + self::assertSame('слов', $sheet0->getCell('A1')->getCalculatedValue()); + // Quotes around sheet name are stripped off - harmless + //self::assertSame("='словслов'!B1", $sheet0->getCell('A2')->getValue()); + self::assertSame('слов', $sheet0->getCell('A2')->getCalculatedValue()); + // Formulas with defined names don't work in Xls Writer + //self::assertSame('=слсл', $sheet0->getCell('A3')->getValue()); + // But result should be accurate + self::assertSame('слов', $sheet0->getCell('A3')->getCalculatedValue()); + $names = $robj->getDefinedNames(); + self::assertCount(1, $names); + // name has been uppercased + $namedRange = $names['СЛСЛ'] ?? null; + self::assertInstanceOf(NamedRange::class, $namedRange); + self::assertSame('$B$1', $namedRange->getRange()); + + $robj->disconnectWorksheets(); + } +}