From d4d6f1de83628b12e4a27d273edace7762f69860 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Wed, 11 Sep 2024 13:51:56 -0400 Subject: [PATCH] [ruby/prism] UTF-8 characters in file name https://github.com/ruby/prism/commit/487f0ffe78 --- prism/extension.c | 28 ++++++++---- prism/util/pm_string.c | 89 ++++++++++++++++++++++++++---------- test/prism/api/parse_test.rb | 10 ++++ 3 files changed, 93 insertions(+), 34 deletions(-) diff --git a/prism/extension.c b/prism/extension.c index cd5165f41a..6a8fedcb05 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -254,14 +254,14 @@ string_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options) * Read options for methods that look like (filepath, **options). */ static void -file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options) { +file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options, VALUE *encoded_filepath) { VALUE filepath; VALUE keywords; rb_scan_args(argc, argv, "1:", &filepath, &keywords); Check_Type(filepath, T_STRING); - - extract_options(options, filepath, keywords); + *encoded_filepath = rb_str_encode_ospath(filepath); + extract_options(options, *encoded_filepath, keywords); const char * string_source = (const char *) pm_string_source(&options->filepath); @@ -352,7 +352,8 @@ dump_file(int argc, VALUE *argv, VALUE self) { pm_string_t input; pm_options_t options = { 0 }; - file_options(argc, argv, &input, &options); + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); VALUE value = dump_input(&input, &options); pm_string_free(&input); @@ -685,7 +686,8 @@ lex_file(int argc, VALUE *argv, VALUE self) { pm_string_t input; pm_options_t options = { 0 }; - file_options(argc, argv, &input, &options); + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); VALUE value = parse_lex_input(&input, &options, false); pm_string_free(&input); @@ -782,7 +784,8 @@ parse_file(int argc, VALUE *argv, VALUE self) { pm_string_t input; pm_options_t options = { 0 }; - file_options(argc, argv, &input, &options); + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); VALUE value = parse_input(&input, &options); pm_string_free(&input); @@ -838,7 +841,9 @@ profile_file(int argc, VALUE *argv, VALUE self) { pm_string_t input; pm_options_t options = { 0 }; - file_options(argc, argv, &input, &options); + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + profile_input(&input, &options); pm_string_free(&input); pm_options_free(&options); @@ -952,7 +957,8 @@ parse_file_comments(int argc, VALUE *argv, VALUE self) { pm_string_t input; pm_options_t options = { 0 }; - file_options(argc, argv, &input, &options); + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); VALUE value = parse_input_comments(&input, &options); pm_string_free(&input); @@ -1007,7 +1013,8 @@ parse_lex_file(int argc, VALUE *argv, VALUE self) { pm_string_t input; pm_options_t options = { 0 }; - file_options(argc, argv, &input, &options); + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); VALUE value = parse_lex_input(&input, &options, true); pm_string_free(&input); @@ -1077,7 +1084,8 @@ parse_file_success_p(int argc, VALUE *argv, VALUE self) { pm_string_t input; pm_options_t options = { 0 }; - file_options(argc, argv, &input, &options); + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); VALUE result = parse_input_success_p(&input, &options); pm_string_free(&input); diff --git a/prism/util/pm_string.c b/prism/util/pm_string.c index 4bd3dd8ef2..3e1e22e34f 100644 --- a/prism/util/pm_string.c +++ b/prism/util/pm_string.c @@ -47,6 +47,53 @@ pm_string_constant_init(pm_string_t *string, const char *source, size_t length) }; } +#ifdef _WIN32 +/** + * Represents a file handle on Windows, where the path will need to be freed + * when the file is closed. + */ +typedef struct { + /** The path to the file, which will become allocated memory. */ + WCHAR *path; + + /** The handle to the file, which will start as uninitialized memory. */ + HANDLE file; +} pm_string_file_handle_t; + +/** + * Open the file indicated by the filepath parameter for reading on Windows. + * Perform any kind of normalization that needs to happen on the filepath. + */ +static bool +pm_string_file_handle_open(pm_string_file_handle_t *handle, const char *filepath) { + int length = MultiByteToWideChar(CP_UTF8, 0, filepath, -1, NULL, 0); + if (length == 0) return false; + + handle->path = xmalloc(sizeof(WCHAR) * ((size_t) length)); + if ((handle->path == NULL) || (MultiByteToWideChar(CP_UTF8, 0, filepath, -1, handle->path, length) == 0)) { + xfree(handle->path); + return false; + } + + handle->file = CreateFileW(handle->path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); + if (handle->file == INVALID_HANDLE_VALUE) { + xfree(handle->path); + return false; + } + + return true; +} + +/** + * Close the file handle and free the path. + */ +static void +pm_string_file_handle_close(pm_string_file_handle_t *handle) { + xfree(handle->path); + CloseHandle(handle->file); +} +#endif + /** * Read the file indicated by the filepath parameter into source and load its * contents and size into the given `pm_string_t`. The given `pm_string_t` @@ -62,39 +109,36 @@ PRISM_EXPORTED_FUNCTION bool pm_string_mapped_init(pm_string_t *string, const char *filepath) { #ifdef _WIN32 // Open the file for reading. - HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); - - if (file == INVALID_HANDLE_VALUE) { - return false; - } + pm_string_file_handle_t handle; + if (!pm_string_file_handle_open(&handle, filepath)) return false; // Get the file size. - DWORD file_size = GetFileSize(file, NULL); + DWORD file_size = GetFileSize(handle.file, NULL); if (file_size == INVALID_FILE_SIZE) { - CloseHandle(file); + pm_string_file_handle_close(&handle); return false; } // If the file is empty, then we don't need to do anything else, we'll set // the source to a constant empty string and return. if (file_size == 0) { - CloseHandle(file); + pm_string_file_handle_close(&handle); const uint8_t source[] = ""; *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; return true; } // Create a mapping of the file. - HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL); + HANDLE mapping = CreateFileMapping(handle.file, NULL, PAGE_READONLY, 0, 0, NULL); if (mapping == NULL) { - CloseHandle(file); + pm_string_file_handle_close(&handle); return false; } // Map the file into memory. uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); CloseHandle(mapping); - CloseHandle(file); + pm_string_file_handle_close(&handle); if (source == NULL) { return false; @@ -156,23 +200,20 @@ PRISM_EXPORTED_FUNCTION bool pm_string_file_init(pm_string_t *string, const char *filepath) { #ifdef _WIN32 // Open the file for reading. - HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); - - if (file == INVALID_HANDLE_VALUE) { - return false; - } + pm_string_file_handle_t handle; + if (!pm_string_file_handle_open(&handle, filepath)) return false; // Get the file size. - DWORD file_size = GetFileSize(file, NULL); + DWORD file_size = GetFileSize(handle.file, NULL); if (file_size == INVALID_FILE_SIZE) { - CloseHandle(file); + pm_string_file_handle_close(&handle); return false; } // If the file is empty, then we don't need to do anything else, we'll set // the source to a constant empty string and return. if (file_size == 0) { - CloseHandle(file); + pm_string_file_handle_close(&handle); const uint8_t source[] = ""; *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; return true; @@ -181,25 +222,25 @@ pm_string_file_init(pm_string_t *string, const char *filepath) { // Create a buffer to read the file into. uint8_t *source = xmalloc(file_size); if (source == NULL) { - CloseHandle(file); + pm_string_file_handle_close(&handle); return false; } // Read the contents of the file DWORD bytes_read; - if (!ReadFile(file, source, file_size, &bytes_read, NULL)) { - CloseHandle(file); + if (!ReadFile(handle.file, source, file_size, &bytes_read, NULL)) { + pm_string_file_handle_close(&handle); return false; } // Check the number of bytes read if (bytes_read != file_size) { xfree(source); - CloseHandle(file); + pm_string_file_handle_close(&handle); return false; } - CloseHandle(file); + pm_string_file_handle_close(&handle); *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = (size_t) file_size }; return true; #elif defined(PRISM_HAS_FILESYSTEM) diff --git a/test/prism/api/parse_test.rb b/test/prism/api/parse_test.rb index 2e9722f5da..6ad3829de0 100644 --- a/test/prism/api/parse_test.rb +++ b/test/prism/api/parse_test.rb @@ -69,6 +69,16 @@ module Prism end end + if RUBY_ENGINE != "truffleruby" + def test_parse_nonascii + Dir.mktmpdir do |dir| + path = File.join(dir, "\u{3042 3044 3046 3048 304a}.rb".encode(Encoding::Windows_31J)) + File.write(path, "ok") + Prism.parse_file(path) + end + end + end + private def find_source_file_node(program)