Last active
July 26, 2023 21:23
-
-
Save rcgale/935c87ce61dc1c93e0ed6801c36adefe to your computer and use it in GitHub Desktop.
Dataset loading in Python (`search_dir()`)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def search_dir(d, pattern, match_full_path=False) -> "Iterator[Tuple[str, Union[str, Tuple[str, ...]]]]":
    """
    Walk a directory tree and yield each file whose path matches a regex pattern.

    Only the first match found in a file's path is reported. The match is yielded in the
    shape produced by `re.findall`: a tuple of strings when the pattern has two or more
    /(groups)/ (so it can be unpacked like a tuple, enjoy!), a single string when the
    pattern has exactly one group, and the whole matched text when it has no groups.

    Directories and files are visited in sorted order, so the sequence of results is
    reproducible from run to run and across filesystems.

    https://gist.github.com/rcgale/935c87ce61dc1c93e0ed6801c36adefe

    :param d: Directory to start in (a leading `~` is expanded)
    :param pattern: A str or re.Pattern to match to files
    :param match_full_path: If true, match regex against full path. If false, match regex against path relative to `d`.
    :return: Iterator of `(file_path, match)` pairs, where `file_path` is the walked path joined onto `d`
    :raises FileNotFoundError: If `d` does not exist. Because this is a generator, the error
        is raised when iteration starts, not when `search_dir` is called.
    """
    import os
    import re

    d = os.path.expanduser(d)
    if not os.path.exists(d):
        raise FileNotFoundError(f"Cannot search directory which does not exist: {d}")

    pattern = re.compile(pattern) if isinstance(pattern, str) else pattern

    for basedir, directories, files in os.walk(d):
        # Sorting the directory list in place steers os.walk's (top-down) descent order;
        # together with sorting the files this makes the output order deterministic.
        directories.sort()
        for file in sorted(files):
            file_path = os.path.join(basedir, file)
            match_path = file_path if match_full_path else os.path.relpath(file_path, d)
            # findall (rather than search) is used on purpose: it returns plain
            # strings / tuples of groups, which callers can unpack directly.
            matches = pattern.findall(match_path)
            if matches:
                yield file_path, matches[0]
### Example Usage ###
import re

YESNO_CORPUS = './waves_yesno'  # https://www.openslr.org/1/

# Eight underscore-separated 0/1 flags, one capture group per flag.
# Raw string with an escaped dot so that only a literal ".wav" extension matches.
WAV_PATTERN = re.compile(r'([01])_([01])_([01])_([01])_([01])_([01])_([01])_([01])\.wav')

# Each match is a tuple of the eight captured flags, so it unpacks directly in the loop header.
for filename, (yn1, yn2, yn3, yn4, yn5, yn6, yn7, yn8) in search_dir(YESNO_CORPUS, WAV_PATTERN):
    # A flag of 1 is counted as a "yes"; the remaining flags are "no".
    num_y = sum(int(v) == 1 for v in (yn1, yn2, yn3, yn4, yn5, yn6, yn7, yn8))
    print(f'Expect {num_y} x yes, {8-num_y} x no in {filename}.')

'''
# The above prints:
Expect 2 x yes, 6 x no in ./waves_yesno/0_0_1_0_0_0_1_0.wav.
Expect 5 x yes, 3 x no in ./waves_yesno/1_1_1_0_1_0_1_0.wav.
Expect 6 x yes, 2 x no in ./waves_yesno/1_1_1_0_1_0_1_1.wav.
Expect 5 x yes, 3 x no in ./waves_yesno/1_1_0_1_1_0_0_1.wav.
# ... etc.
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment