Skip to content

Instantly share code, notes, and snippets.

@rcgale
Last active July 26, 2023 21:23
Show Gist options
  • Select an option

  • Save rcgale/935c87ce61dc1c93e0ed6801c36adefe to your computer and use it in GitHub Desktop.

Select an option

Save rcgale/935c87ce61dc1c93e0ed6801c36adefe to your computer and use it in GitHub Desktop.
Dataset loading in Python (`search_dir()`)
def search_dir(d, pattern, match_full_path=False) -> "Iterator[Tuple[str, Tuple[str]]]":
    """
    Performs an `os.walk` and returns an iterator of `(file_path, match)` for each file whose path matches a
    regex pattern. Only the first (leftmost) match within each path is reported.

    The shape of `match` follows the `re.findall` convention:
      * pattern with two or more groups -> tuple of group strings, which can be unpacked like a tuple, so enjoy!
      * pattern with exactly one group  -> that group's text as a plain `str` (NOT a 1-tuple)
      * pattern with no groups          -> the whole matched text as a `str`
    Groups which did not participate in the match are reported as `''`.

    The directory check and the pattern compilation happen when `search_dir` is called, not when the returned
    iterator is first advanced, so a bad argument fails at the call site.

    https://gist.github.com/rcgale/935c87ce61dc1c93e0ed6801c36adefe
    :param d: Directory to start in (`~` is expanded)
    :param pattern: A str or re.Pattern to match to files
    :param match_full_path: If true, match regex against full path. If false, match regex against path relative to `d`.
    :return: Iterator of `(file_path, match)`; `file_path` is always the walked path joined onto `d`.
    :raises FileNotFoundError: If `d` does not exist.
    """
    import os
    import re

    root = os.path.expanduser(d)
    if not os.path.exists(root):
        raise FileNotFoundError(f"Cannot search directory which does not exist: {root}")
    regex = re.compile(pattern) if isinstance(pattern, str) else pattern

    def iter_matches():
        # The walk itself stays lazy; only the argument validation above is eager.
        for basedir, _, files in os.walk(root):
            for file in files:
                file_path = os.path.join(basedir, file)
                match_path = file_path if match_full_path else os.path.relpath(file_path, root)
                # A single search is enough: only the first match per file is ever reported.
                found = regex.search(match_path)
                if found is None:
                    continue
                # Shape the result exactly like the first element of `re.findall` would be.
                if regex.groups == 0:
                    yield file_path, found.group()
                elif regex.groups == 1:
                    yield file_path, found.group(1) or ""
                else:
                    yield file_path, found.groups(default="")

    return iter_matches()
### Example Usage ###
# Walks the "yes/no" corpus, whose file names encode eight binary labels
# (1 = "yes", 0 = "no"), and prints how many of each to expect per file.
import re

YESNO_CORPUS = './waves_yesno' # https://www.openslr.org/1/
# Raw string with an escaped dot so that only a literal ".wav" extension matches.
WAV_PATTERN = re.compile(r'([01])_([01])_([01])_([01])_([01])_([01])_([01])_([01])\.wav')

# The eight captured groups arrive as a tuple and are unpacked directly in the loop header.
for filename, (yn1, yn2, yn3, yn4, yn5, yn6, yn7, yn8) in search_dir(YESNO_CORPUS, WAV_PATTERN):
    num_y = sum(int(v) == 1 for v in (yn1, yn2, yn3, yn4, yn5, yn6, yn7, yn8))
    print(f'Expect {num_y} x yes, {8-num_y} x no in {filename}.')
'''
# The above prints:
Expect 2 x yes, 6 x no in ./waves_yesno/0_0_1_0_0_0_1_0.wav.
Expect 5 x yes, 3 x no in ./waves_yesno/1_1_1_0_1_0_1_0.wav.
Expect 6 x yes, 2 x no in ./waves_yesno/1_1_1_0_1_0_1_1.wav.
Expect 5 x yes, 3 x no in ./waves_yesno/1_1_0_1_1_0_0_1.wav.
# ... etc.
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment