Skip to content

Commit

Permalink
GH-115060: Speed up pathlib.Path.glob() by removing redundant regex matching
Browse files Browse the repository at this point in the history

When expanding and filtering paths for a `**` wildcard segment, build an
`re.Pattern` object from the subsequent pattern parts, rather than the
entire pattern.

Also skip compiling a pattern when expanding a `*` wildcard segment.
  • Loading branch information
barneygale committed Feb 6, 2024
1 parent 1b1f839 commit 54c5aa5
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 14 deletions.
26 changes: 12 additions & 14 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,21 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
continue
except OSError:
continue
if match(entry.name):
if match is None or match(entry.name):
yield parent_path._make_child_entry(entry)


def _select_recursive(parent_paths, dir_only, follow_symlinks):
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
"""Yield given paths and all their subdirectories, recursively."""
if follow_symlinks is None:
follow_symlinks = False
for parent_path in parent_paths:
prefix_len = len(str(parent_path._make_child_relpath('_'))) - 1
paths = [parent_path._make_child_relpath('')]
while paths:
path = paths.pop()
yield path
if match is None or match(str(path), prefix_len):
yield path
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
Expand All @@ -115,7 +117,9 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
except OSError:
pass
if not dir_only:
yield path._make_child_entry(entry)
file_path = path._make_child_entry(entry)
if match is None or match(str(file_path), prefix_len):
yield file_path


def _select_unique(paths):
Expand Down Expand Up @@ -769,7 +773,6 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):

stack = pattern._pattern_stack
specials = ('', '.', '..')
filter_paths = False
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self] if self.is_dir() else [])
Expand All @@ -786,11 +789,11 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
# regex filtering, provided we're treating symlinks consistently.
if follow_symlinks is not None:
while stack and stack[-1] not in specials:
filter_paths = True
stack.pop()
part += sep + stack.pop()

dir_only = bool(stack)
paths = _select_recursive(paths, dir_only, follow_symlinks)
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
paths = _select_recursive(paths, dir_only, follow_symlinks, match)
if deduplicate_paths:
# De-duplicate if we've already seen a '**' component.
paths = _select_unique(paths)
Expand All @@ -799,13 +802,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
dir_only = bool(stack)
match = _compile_pattern(part, sep, case_sensitive)
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
paths = _select_children(paths, dir_only, follow_symlinks, match)
if filter_paths:
# Filter out paths that don't match pattern.
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(pattern._pattern_str, sep, case_sensitive)
paths = (path for path in paths if match(path._pattern_str, prefix_len))
return paths

def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Speed up :meth:`pathlib.Path.glob` by removing redundant regex matching.

0 comments on commit 54c5aa5

Please sign in to comment.