Bug 1641504: Bump Cranelift to e3d89c8a92a5fadedd75359b8485d23ac45ecf29. r=bbouvier

Differential Revision: https://phabricator.services.mozilla.com/D78587
This commit is contained in:
Chris Fallin 2020-06-09 22:37:06 +00:00
Parent e12b4c3ef8
Commit 6944be383a
146 changed files with 19611 additions and 9495 deletions

View file

@ -60,7 +60,7 @@ rev = "3224e2dee65c0726c448484d4c3c43956b9330ec"
[source."https://github.com/bytecodealliance/wasmtime"]
git = "https://github.com/bytecodealliance/wasmtime"
replace-with = "vendored-sources"
rev = "b7cfd39b531680217537cfcf5294a22077a0a58d"
rev = "e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
[source."https://github.com/badboy/failure"]
git = "https://github.com/badboy/failure"

46
Cargo.lock generated
View file

@ -763,22 +763,22 @@ dependencies = [
[[package]]
name = "cranelift-bforest"
version = "0.63.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=b7cfd39b531680217537cfcf5294a22077a0a58d#b7cfd39b531680217537cfcf5294a22077a0a58d"
version = "0.64.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=e3d89c8a92a5fadedd75359b8485d23ac45ecf29#e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
dependencies = [
"cranelift-entity 0.63.0",
"cranelift-entity 0.64.0",
]
[[package]]
name = "cranelift-codegen"
version = "0.63.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=b7cfd39b531680217537cfcf5294a22077a0a58d#b7cfd39b531680217537cfcf5294a22077a0a58d"
version = "0.64.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=e3d89c8a92a5fadedd75359b8485d23ac45ecf29#e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
dependencies = [
"byteorder",
"cranelift-bforest",
"cranelift-codegen-meta",
"cranelift-codegen-shared",
"cranelift-entity 0.63.0",
"cranelift-entity 0.64.0",
"log",
"regalloc",
"smallvec",
@ -788,17 +788,17 @@ dependencies = [
[[package]]
name = "cranelift-codegen-meta"
version = "0.63.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=b7cfd39b531680217537cfcf5294a22077a0a58d#b7cfd39b531680217537cfcf5294a22077a0a58d"
version = "0.64.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=e3d89c8a92a5fadedd75359b8485d23ac45ecf29#e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
dependencies = [
"cranelift-codegen-shared",
"cranelift-entity 0.63.0",
"cranelift-entity 0.64.0",
]
[[package]]
name = "cranelift-codegen-shared"
version = "0.63.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=b7cfd39b531680217537cfcf5294a22077a0a58d#b7cfd39b531680217537cfcf5294a22077a0a58d"
version = "0.64.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=e3d89c8a92a5fadedd75359b8485d23ac45ecf29#e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
[[package]]
name = "cranelift-entity"
@ -807,13 +807,13 @@ source = "git+https://github.com/PLSysSec/lucet_sandbox_compiler?rev=5e870faf6f9
[[package]]
name = "cranelift-entity"
version = "0.63.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=b7cfd39b531680217537cfcf5294a22077a0a58d#b7cfd39b531680217537cfcf5294a22077a0a58d"
version = "0.64.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=e3d89c8a92a5fadedd75359b8485d23ac45ecf29#e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
[[package]]
name = "cranelift-frontend"
version = "0.63.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=b7cfd39b531680217537cfcf5294a22077a0a58d#b7cfd39b531680217537cfcf5294a22077a0a58d"
version = "0.64.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=e3d89c8a92a5fadedd75359b8485d23ac45ecf29#e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
dependencies = [
"cranelift-codegen",
"log",
@ -823,15 +823,15 @@ dependencies = [
[[package]]
name = "cranelift-wasm"
version = "0.63.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=b7cfd39b531680217537cfcf5294a22077a0a58d#b7cfd39b531680217537cfcf5294a22077a0a58d"
version = "0.64.0"
source = "git+https://github.com/bytecodealliance/wasmtime?rev=e3d89c8a92a5fadedd75359b8485d23ac45ecf29#e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
dependencies = [
"cranelift-codegen",
"cranelift-entity 0.63.0",
"cranelift-entity 0.64.0",
"cranelift-frontend",
"log",
"thiserror",
"wasmparser 0.51.4",
"wasmparser 0.57.0",
]
[[package]]
@ -3956,9 +3956,9 @@ dependencies = [
[[package]]
name = "regalloc"
version = "0.0.21"
version = "0.0.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b27b256b41986ac5141b37b8bbba85d314fbf546c182eb255af6720e07e4f804"
checksum = "cca5b48c9db66c5ba084e4660b4c0cfe8b551a96074bc04b7c11de86ad0bf1f9"
dependencies = [
"log",
"rustc-hash",
@ -5375,9 +5375,9 @@ checksum = "073da89bf1c84db000dd68ce660c1b4a08e3a2d28fd1e3394ab9e7abdde4a0f8"
[[package]]
name = "wasmparser"
version = "0.51.4"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aeb1956b19469d1c5e63e459d29e7b5aa0f558d9f16fcef09736f8a265e6c10a"
checksum = "32fddd575d477c6e9702484139cf9f23dcd554b06d185ed0f56c857dd3a47aa6"
[[package]]
name = "wast"

View file

@ -76,8 +76,8 @@ failure_derive = { git = "https://github.com/badboy/failure", rev = "64af847bc5f
[patch.crates-io.cranelift-codegen]
git = "https://github.com/bytecodealliance/wasmtime"
rev = "b7cfd39b531680217537cfcf5294a22077a0a58d"
rev = "e3d89c8a92a5fadedd75359b8485d23ac45ecf29"
[patch.crates-io.cranelift-wasm]
git = "https://github.com/bytecodealliance/wasmtime"
rev = "b7cfd39b531680217537cfcf5294a22077a0a58d"
rev = "e3d89c8a92a5fadedd75359b8485d23ac45ecf29"

View file

@ -13,8 +13,8 @@ name = "baldrdash"
# cranelift-wasm to pinned commits. If you want to update Cranelift in Gecko,
# you should update the following $TOP_LEVEL/Cargo.toml file: look for the
# revision (rev) hashes of both cranelift dependencies (codegen and wasm).
cranelift-codegen = { version = "0.63.0", default-features = false }
cranelift-wasm = "0.63.0"
cranelift-codegen = { version = "0.64.0", default-features = false }
cranelift-wasm = "0.64.0"
log = { version = "0.4.6", default-features = false, features = ["release_max_level_info"] }
env_logger = "0.6"
smallvec = "1.0"

View file

@ -1 +1 @@
{"files":{"Cargo.toml":"07d7670bb6f0c26fa3abb5d547d645b8b6ab32378dba33e3453122c8ba59c6b5","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"af367c67340fa7f6fb9a35b0aa637dcf303957f7ae7427a5f4f6356801c8bb04","src/lib.rs":"23a5c42d477197a947122e662068e681bb9ed31041c0b668c3267c3fce15d39e","src/map.rs":"a3b7f64cae7ec9c2a8038def315bcf90e8751552b1bc1c20b62fbb8c763866c4","src/node.rs":"28f7edd979f7b9712bc4ab30b0d2a1b8ad5485a4b1e8c09f3dcaf501b9b5ccd1","src/path.rs":"a86ee1c882c173e8af96fd53a416a0fb485dd3f045ac590ef313a9d9ecf90f56","src/pool.rs":"f6337b5417f7772e6878a160c1a40629199ff09997bdff18eb2a0ba770158600","src/set.rs":"281eb8b5ead1ffd395946464d881f9bb0e7fb61092aed701d72d2314b5f80994"},"package":null}
{"files":{"Cargo.toml":"fe108380fdfaac0d92a92302d0751df182b888e874e56e465f4241dbb670a92e","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"af367c67340fa7f6fb9a35b0aa637dcf303957f7ae7427a5f4f6356801c8bb04","src/lib.rs":"23a5c42d477197a947122e662068e681bb9ed31041c0b668c3267c3fce15d39e","src/map.rs":"a3b7f64cae7ec9c2a8038def315bcf90e8751552b1bc1c20b62fbb8c763866c4","src/node.rs":"28f7edd979f7b9712bc4ab30b0d2a1b8ad5485a4b1e8c09f3dcaf501b9b5ccd1","src/path.rs":"a86ee1c882c173e8af96fd53a416a0fb485dd3f045ac590ef313a9d9ecf90f56","src/pool.rs":"f6337b5417f7772e6878a160c1a40629199ff09997bdff18eb2a0ba770158600","src/set.rs":"281eb8b5ead1ffd395946464d881f9bb0e7fb61092aed701d72d2314b5f80994"},"package":null}

View file

@ -1,7 +1,7 @@
[package]
authors = ["The Cranelift Project Developers"]
name = "cranelift-bforest"
version = "0.63.0"
version = "0.64.0"
description = "A forest of B+-trees"
license = "Apache-2.0 WITH LLVM-exception"
documentation = "https://docs.rs/cranelift-bforest"
@ -12,7 +12,7 @@ keywords = ["btree", "forest", "set", "map"]
edition = "2018"
[dependencies]
cranelift-entity = { path = "../entity", version = "0.63.0", default-features = false }
cranelift-entity = { path = "../entity", version = "0.64.0", default-features = false }
[badges]
maintenance = { status = "experimental" }

View file

@ -1 +1 @@
{"files":{"Cargo.toml":"2d1fae4231bb7d3c43ebcaccbc62d243440ab537a5b6bd40c653ece0bcda5a75","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"b123f056d0d458396679c5f7f2a16d2762af0258fcda4ac14b6655a95e5a0022","src/cdsl/ast.rs":"84a4b7e3301e3249716958a7aa4ea5ba8c6172e3c02f57ee3880504c4433ff19","src/cdsl/cpu_modes.rs":"996e45b374cfe85ac47c8c86c4459fe4c04b3158102b4c63b6ee434d5eed6a9e","src/cdsl/encodings.rs":"d884a564815a03c23369bcf31d13b122ae5ba84d0c80eda9312f0c0a829bf794","src/cdsl/formats.rs":"63e638305aa3ca6dd409ddf0e5e9605eeac1cc2631103e42fc6cbc87703d9b63","src/cdsl/instructions.rs":"41e1a230501de3f0da3960d8aa375c8bcd60ec62ede94ad61806816acbd8009a","src/cdsl/isa.rs":"ccabd6848b69eb069c10db61c7e7f86080777495714bb53d03e663c40541be94","src/cdsl/mod.rs":"0aa827923bf4c45e5ee2359573bd863e00f474acd532739f49dcd74a27553882","src/cdsl/operands.rs":"1c3411504de9c83112ff48e0ff1cfbb2e4ba5a9a15c1716f411ef31a4df59899","src/cdsl/recipes.rs":"80b7cd87332229b569e38086ceee8d557e679b9a32ad2e50bdb15c33337c3418","src/cdsl/regs.rs":"466a42a43355fc7623fe5d8e8d330622207a3af6a80cb9367bc0f06e224c9ee0","src/cdsl/settings.rs":"e6fd9a31925743b93b11f09c9c8271bab6aa2430aa053a2601957b4487df7d77","src/cdsl/type_inference.rs":"1efca8a095ffc899b7527bda6b9d9378c73d7283f8dceaa4819e8af599f8be21","src/cdsl/types.rs":"ff764c9e9c29a05677bff6164e7bc25a0c32655052d77ae580536abba8b1713b","src/cdsl/typevar.rs":"371ac795added2cb464371443313eb55350c629c62ce8e62e192129b6c41d45e","src/cdsl/xform.rs":"55da0c3f2403147b535ab6ae5d69c623fbe839edecf2a3af1de84420cd58402d","src/default_map.rs":"101bb0282a124f9c921f6bd095f529e8753621450d783c3273b0b0394c2c5c03","src/error.rs":"e9b11b2feb2d867b94c8810fdc5a6c4e0d9131604a0bfa5340ff2639a55100b4","src/gen_binemit.rs":"515e243420b30d1e01f8ea630282d9b6d78a715e1951f3f20392e19a48164442","src/gen_encodings.rs":"f00cded6b68a9b48c9e3cd39a8b6f0ba136f4062c8f8666109158a72c62c3ed1","src/gen_inst.rs":"b275053977c0239211c1df35253154ba4dce2519f506088e71104de37d3db862","src/gen_legalizer.rs":"ea229ab9393cc5ba2242f626e74c624ea59314535e74b26602dafb8e96481a72","src/gen_registers.rs":"a904119ed803c9de24dedd15149a65337ffc168bb1d63df53d7fdebfb5f4b158","src/gen_settings.rs":"f3cc3d31f6cc898f30606caf084f0de220db2d3b1b5e5e4145fa7c9a9a1597e2","src/gen_types.rs":"f6c090e1646a43bf2fe81ae0a7029cc6f7dc6d43285368f56d86c35a21c469a6","src/isa/arm32/mod.rs":"da18cb40c1a0a6b613ddefcc38a5d01d02c95de6f233ebd4ad84fefb992c008b","src/isa/arm64/mod.rs":"3a815eaa478d82b7f8b536b83f9debb6b79ec860f99fea6485f209a836c6939a","src/isa/mod.rs":"136141f99f217ba42b9e3f7f47238ab19cc974bb3bef2e2df7f7b5a683989d46","src/isa/riscv/encodings.rs":"8abb1968d917588bc5fc5f5be6dd66bdec23ac456ba65f8138237c8e891e843c","src/isa/riscv/mod.rs":"a7b461a30bbfbc1e3b33645422ff40d5b1761c30cb5d4a8aa12e9a3b7f7aee51","src/isa/riscv/recipes.rs":"fd5a7418fa0d47cdf1b823b31553f1549c03e160ffffac9e22d611185774367e","src/isa/x86/encodings.rs":"a19e5dd7ba7fe74f2ec0a2367e61e2dab498113f8b2a2f1bc677b6ee486358d5","src/isa/x86/instructions.rs":"144e83591444115f2ab8d16777e322eb5c9d8eef123ad05d0c66811a029b662b","src/isa/x86/legalize.rs":"d2eb6cee5c885870250417f4d9086527c96f994542c9316baf14776b500e45b0","src/isa/x86/mod.rs":"65953f998ff3fc3b333167e9979fc0f15f976b51ad75272ac19dcaad0981b371","src/isa/x86/opcodes.rs":"44556abfc4a319a6e48aa878f10550b7878725ba0bf75ddc9bb6a0e6f4223c73","src/isa/x86/recipes.rs":"f142ae4ea1db29df0f3c9aedf0c5ee228682136526499f0c85aab101375d0c8c","src/isa/x86/registers.rs":"4be0a45d8acd465c31746b7976124025b06b453e3f6d587f93efb5af0e1
2b1a8","src/isa/x86/settings.rs":"49abb46533b3a5415cd033e0a98b5c9561e231f2dd9510d587dc69b204bb6706","src/lib.rs":"2491b0e74078914cb89d1778fa8174daf723fe76aaf7fed18741237d68f6df32","src/shared/entities.rs":"90f774a70e1c2a2e9a553c07a5e80e0fe54cf127434bd83e67274bba4e1a19ba","src/shared/formats.rs":"89ed4074f748637adf56b93ba952e398c45d43e6326d01676885939e3fe8bc4a","src/shared/immediates.rs":"e4a57657f6af9853794804eb41c01204a2c13a632f44f55d90e156a4b98c5f65","src/shared/instructions.rs":"8df3abeb47b52b7dc99f6e0bb16cf8a695ce4fe0a8d86035945a2612d1aa5a6d","src/shared/legalize.rs":"bc9c3292446c1d338df1c4ce19f3ac5482cfe582a04a5a1e82fc9aaa6aef25ea","src/shared/mod.rs":"c219625990bf15507ac1077b349ce20e5312d4e4707426183676d469e78792b7","src/shared/settings.rs":"9460758f04ccfc9129ea4d4081571fe4a3ac574c3d25b6473f888fbbb506b9d3","src/shared/types.rs":"4702df132f4b5d70cc9411ec5221ba0b1bd4479252274e0223ae57b6d0331247","src/srcgen.rs":"dcfc159c8599270f17e6a978c4be255abca51556b5ef0da497faec4a4a1e62ce","src/unique_table.rs":"31aa54330ca4786af772d32e8cb6158b6504b88fa93fe177bf0c6cbe545a8d35"},"package":null}
{"files":{"Cargo.toml":"a19ba59829e25d67120787a454038986a6759f7d592dcf427924ebbcb5de6697","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"b123f056d0d458396679c5f7f2a16d2762af0258fcda4ac14b6655a95e5a0022","src/cdsl/ast.rs":"84a4b7e3301e3249716958a7aa4ea5ba8c6172e3c02f57ee3880504c4433ff19","src/cdsl/cpu_modes.rs":"996e45b374cfe85ac47c8c86c4459fe4c04b3158102b4c63b6ee434d5eed6a9e","src/cdsl/encodings.rs":"d884a564815a03c23369bcf31d13b122ae5ba84d0c80eda9312f0c0a829bf794","src/cdsl/formats.rs":"63e638305aa3ca6dd409ddf0e5e9605eeac1cc2631103e42fc6cbc87703d9b63","src/cdsl/instructions.rs":"41e1a230501de3f0da3960d8aa375c8bcd60ec62ede94ad61806816acbd8009a","src/cdsl/isa.rs":"ccabd6848b69eb069c10db61c7e7f86080777495714bb53d03e663c40541be94","src/cdsl/mod.rs":"0aa827923bf4c45e5ee2359573bd863e00f474acd532739f49dcd74a27553882","src/cdsl/operands.rs":"1c3411504de9c83112ff48e0ff1cfbb2e4ba5a9a15c1716f411ef31a4df59899","src/cdsl/recipes.rs":"80b7cd87332229b569e38086ceee8d557e679b9a32ad2e50bdb15c33337c3418","src/cdsl/regs.rs":"466a42a43355fc7623fe5d8e8d330622207a3af6a80cb9367bc0f06e224c9ee0","src/cdsl/settings.rs":"e6fd9a31925743b93b11f09c9c8271bab6aa2430aa053a2601957b4487df7d77","src/cdsl/type_inference.rs":"1efca8a095ffc899b7527bda6b9d9378c73d7283f8dceaa4819e8af599f8be21","src/cdsl/types.rs":"ff764c9e9c29a05677bff6164e7bc25a0c32655052d77ae580536abba8b1713b","src/cdsl/typevar.rs":"371ac795added2cb464371443313eb55350c629c62ce8e62e192129b6c41d45e","src/cdsl/xform.rs":"55da0c3f2403147b535ab6ae5d69c623fbe839edecf2a3af1de84420cd58402d","src/default_map.rs":"101bb0282a124f9c921f6bd095f529e8753621450d783c3273b0b0394c2c5c03","src/error.rs":"e9b11b2feb2d867b94c8810fdc5a6c4e0d9131604a0bfa5340ff2639a55100b4","src/gen_binemit.rs":"515e243420b30d1e01f8ea630282d9b6d78a715e1951f3f20392e19a48164442","src/gen_encodings.rs":"f00cded6b68a9b48c9e3cd39a8b6f0ba136f4062c8f8666109158a72c62c3ed1","src/gen_inst.rs":"88532d2e2c9724dde968d6b046927249c33d2037ab3e3fd1bd7ebfa77fe12bc7","src/gen_legalizer.rs":"ea229ab9393cc5ba2242f626e74c624ea59314535e74b26602dafb8e96481a72","src/gen_registers.rs":"a904119ed803c9de24dedd15149a65337ffc168bb1d63df53d7fdebfb5f4b158","src/gen_settings.rs":"f3cc3d31f6cc898f30606caf084f0de220db2d3b1b5e5e4145fa7c9a9a1597e2","src/gen_types.rs":"f6c090e1646a43bf2fe81ae0a7029cc6f7dc6d43285368f56d86c35a21c469a6","src/isa/arm32/mod.rs":"da18cb40c1a0a6b613ddefcc38a5d01d02c95de6f233ebd4ad84fefb992c008b","src/isa/arm64/mod.rs":"3a815eaa478d82b7f8b536b83f9debb6b79ec860f99fea6485f209a836c6939a","src/isa/mod.rs":"136141f99f217ba42b9e3f7f47238ab19cc974bb3bef2e2df7f7b5a683989d46","src/isa/riscv/encodings.rs":"8abb1968d917588bc5fc5f5be6dd66bdec23ac456ba65f8138237c8e891e843c","src/isa/riscv/mod.rs":"a7b461a30bbfbc1e3b33645422ff40d5b1761c30cb5d4a8aa12e9a3b7f7aee51","src/isa/riscv/recipes.rs":"5be3bf7c9ba3c51ece384b7eee75a8f7fa0cbacc6a5babc9d0e1d92a2e54a4c2","src/isa/x86/encodings.rs":"87c70a4856bb1c40ba6babed549aa7e01478375244dea605be0334ae6d0441e0","src/isa/x86/instructions.rs":"a2c81ff80e30980fe444aa1e56ba57c54911cee67c392c16bfbdf28f75151dc6","src/isa/x86/legalize.rs":"b5f68ea089c4237c7140ef0b8ff71f7c6a5f53884bf2158d81b52d3750bcacac","src/isa/x86/mod.rs":"ecc1d4de51bd44dbaa864fafebb68f66bc99fb8c9ad67a0fcb420bd1f87d1524","src/isa/x86/opcodes.rs":"f98dd104910efbfa3c211080c68a17da607ce585b9d81bf22cb255e58e51f99f","src/isa/x86/recipes.rs":"b71a3746ed39b08932dc1a0ce885b61eec2e8daf2e92d12eccc0d085e4587a1f","src/isa/x86/registers.rs":"4be0a45d8acd465c31746b7976124025b06b453e3f6d587f93efb5af0e1
2b1a8","src/isa/x86/settings.rs":"69623c2193458c838617e52e88d3ff91b71f3f07aec1f1494c0cabd7c332ad49","src/lib.rs":"2491b0e74078914cb89d1778fa8174daf723fe76aaf7fed18741237d68f6df32","src/shared/entities.rs":"90f774a70e1c2a2e9a553c07a5e80e0fe54cf127434bd83e67274bba4e1a19ba","src/shared/formats.rs":"2f8cbb008778a49b60efac4647dffef654d225823e03ca6272af2678666dc423","src/shared/immediates.rs":"e4a57657f6af9853794804eb41c01204a2c13a632f44f55d90e156a4b98c5f65","src/shared/instructions.rs":"38b9a3b09bd86d020b841abe94eef003063b2cb12d9dc991a7743b2cc0bb3362","src/shared/legalize.rs":"55b186e09383cc16491a6a0dd79aa9149c1aba1927a7173701478818b8116795","src/shared/mod.rs":"c219625990bf15507ac1077b349ce20e5312d4e4707426183676d469e78792b7","src/shared/settings.rs":"0b4f903de5f2df19304c44bf4bd456c3a8e165103b38ccb13b6f88ae8a3c7ee8","src/shared/types.rs":"4702df132f4b5d70cc9411ec5221ba0b1bd4479252274e0223ae57b6d0331247","src/srcgen.rs":"dcfc159c8599270f17e6a978c4be255abca51556b5ef0da497faec4a4a1e62ce","src/unique_table.rs":"31aa54330ca4786af772d32e8cb6158b6504b88fa93fe177bf0c6cbe545a8d35"},"package":null}

View file

@ -1,19 +1,19 @@
[package]
name = "cranelift-codegen-meta"
authors = ["The Cranelift Project Developers"]
version = "0.63.0"
version = "0.64.0"
description = "Metaprogram for cranelift-codegen code generator library"
license = "Apache-2.0 WITH LLVM-exception"
repository = "https://github.com/bytecodealliance/wasmtime"
readme = "README.md"
edition = "2018"
[package.metadata.docs.rs]
rustdoc-args = [ "--document-private-items" ]
[dependencies]
cranelift-codegen-shared = { path = "../shared", version = "0.63.0" }
cranelift-entity = { path = "../../entity", version = "0.63.0" }
cranelift-codegen-shared = { path = "../shared", version = "0.64.0" }
cranelift-entity = { path = "../../entity", version = "0.64.0" }
[badges]
maintenance = { status = "experimental" }
[package.metadata.docs.rs]
rustdoc-args = [ "--document-private-items" ]

View file

@ -874,17 +874,32 @@ fn gen_format_constructor(format: &InstructionFormat, fmt: &mut Formatter) {
args.join(", ")
);
let imms_need_sign_extension = format
.imm_fields
.iter()
.any(|f| f.kind.rust_type == "ir::immediates::Imm64");
fmt.doc_comment(format.to_string());
fmt.line("#[allow(non_snake_case)]");
fmtln!(fmt, "fn {} {{", proto);
fmt.indent(|fmt| {
// Generate the instruction data.
fmtln!(fmt, "let data = ir::InstructionData::{} {{", format.name);
fmtln!(
fmt,
"let{} data = ir::InstructionData::{} {{",
if imms_need_sign_extension { " mut" } else { "" },
format.name
);
fmt.indent(|fmt| {
fmt.line("opcode,");
gen_member_inits(format, fmt);
});
fmtln!(fmt, "};");
if imms_need_sign_extension {
fmtln!(fmt, "data.sign_extend_immediates(ctrl_typevar);");
}
fmt.line("self.build(data, ctrl_typevar)");
});
fmtln!(fmt, "}");
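The generated constructors above now mark `data` mutable and call `sign_extend_immediates` whenever the format carries an `ir::immediates::Imm64`. As a minimal standalone sketch of what sign-extending an immediate to the controlling type's width means (illustrative helper, not the actual Cranelift method):

fn sign_extend_from(imm: i64, ctrl_type_bits: u32) -> i64 {
    // Move the low `ctrl_type_bits` bits to the top, then arithmetic-shift back
    // down so the value's sign bit is replicated through bit 63.
    let shift = 64 - ctrl_type_bits;
    (imm << shift) >> shift
}

fn main() {
    assert_eq!(sign_extend_from(0xFF, 8), -1); // 8-bit 0xff reads back as -1
    assert_eq!(sign_extend_from(0x7F, 8), 127); // sign bit clear: unchanged
}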

View file

@ -64,7 +64,7 @@ pub(crate) fn define(shared_defs: &SharedDefinitions, regs: &IsaRegs) -> RecipeG
// R-type with an immediate shift amount instead of rs2.
recipes.push(
EncodingRecipeBuilder::new("Rshamt", &formats.binary_imm, 4)
EncodingRecipeBuilder::new("Rshamt", &formats.binary_imm64, 4)
.operands_in(vec![gpr])
.operands_out(vec![gpr])
.emit("put_rshamt(bits, in_reg0, imm.into(), out_reg0, sink);"),
@ -79,11 +79,11 @@ pub(crate) fn define(shared_defs: &SharedDefinitions, regs: &IsaRegs) -> RecipeG
);
recipes.push(
EncodingRecipeBuilder::new("Ii", &formats.binary_imm, 4)
EncodingRecipeBuilder::new("Ii", &formats.binary_imm64, 4)
.operands_in(vec![gpr])
.operands_out(vec![gpr])
.inst_predicate(InstructionPredicate::new_is_signed_int(
&*formats.binary_imm,
&*formats.binary_imm64,
"imm",
12,
0,

View file

@ -689,6 +689,12 @@ fn define_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r:
}
}
}
for (to, from) in &[(I16, B16), (I32, B32), (I64, B64)] {
e.enc_both(
bint.bind(*to).bind(*from),
rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
);
}
// Copy Special
// For x86-64, only define REX forms for now, since we can't describe the
@ -1448,6 +1454,7 @@ fn define_alu(
// x86 has a bitwise not instruction NOT.
e.enc_i32_i64(bnot, rec_ur.opcodes(&NOT).rrr(2));
e.enc_b32_b64(bnot, rec_ur.opcodes(&NOT).rrr(2));
e.enc_both(bnot.bind(B1), rec_ur.opcodes(&NOT).rrr(2));
// Also add `b1` encodings for the logic instructions.
// TODO: Should this be done with 8-bit instructions? It would improve partial register
@ -1487,8 +1494,13 @@ fn define_alu(
for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
// Cannot use enc_i32_i64 for this pattern because instructions require
// to bind any.
e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr));
e.enc32(
inst.bind(I32).bind(Any),
inst.bind(I32).bind(I16),
rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
);
e.enc32(
inst.bind(I32).bind(I32),
rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
);
e.enc64(
@ -1601,8 +1613,11 @@ fn define_simd(
let sadd_sat = shared.by_name("sadd_sat");
let scalar_to_vector = shared.by_name("scalar_to_vector");
let sload8x8 = shared.by_name("sload8x8");
let sload8x8_complex = shared.by_name("sload8x8_complex");
let sload16x4 = shared.by_name("sload16x4");
let sload16x4_complex = shared.by_name("sload16x4_complex");
let sload32x2 = shared.by_name("sload32x2");
let sload32x2_complex = shared.by_name("sload32x2_complex");
let spill = shared.by_name("spill");
let sqrt = shared.by_name("sqrt");
let sshr_imm = shared.by_name("sshr_imm");
@ -1611,11 +1626,15 @@ fn define_simd(
let store_complex = shared.by_name("store_complex");
let uadd_sat = shared.by_name("uadd_sat");
let uload8x8 = shared.by_name("uload8x8");
let uload8x8_complex = shared.by_name("uload8x8_complex");
let uload16x4 = shared.by_name("uload16x4");
let uload16x4_complex = shared.by_name("uload16x4_complex");
let uload32x2 = shared.by_name("uload32x2");
let uload32x2_complex = shared.by_name("uload32x2_complex");
let ushr_imm = shared.by_name("ushr_imm");
let usub_sat = shared.by_name("usub_sat");
let vconst = shared.by_name("vconst");
let vselect = shared.by_name("vselect");
let x86_insertps = x86.by_name("x86_insertps");
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
@ -1626,6 +1645,8 @@ fn define_simd(
let x86_pmaxu = x86.by_name("x86_pmaxu");
let x86_pmins = x86.by_name("x86_pmins");
let x86_pminu = x86.by_name("x86_pminu");
let x86_pmullq = x86.by_name("x86_pmullq");
let x86_pmuludq = x86.by_name("x86_pmuludq");
let x86_pshufb = x86.by_name("x86_pshufb");
let x86_pshufd = x86.by_name("x86_pshufd");
let x86_psll = x86.by_name("x86_psll");
@ -1636,6 +1657,7 @@ fn define_simd(
let x86_punpckl = x86.by_name("x86_punpckl");
// Shorthands for recipes.
let rec_blend = r.template("blend");
let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
let rec_f_ib = r.template("f_ib");
let rec_fa = r.template("fa");
@ -1705,6 +1727,20 @@ fn define_simd(
e.enc_both_inferred(instruction, template);
}
// SIMD vselect; the controlling value of vselect is a boolean vector, so each lane should be
// either all ones or all zeroes, which makes it possible to always use the 8-bit PBLENDVB;
// for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD.
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let opcode = match ty.lane_bits() {
32 => &BLENDVPS,
64 => &BLENDVPD,
_ => &PBLENDVB,
};
let instruction = vselect.bind(vector(ty, sse_vector_size));
let template = rec_blend.opcodes(opcode);
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
}
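A rough standalone sketch of the byte-granular blend semantics the comment above relies on (illustrative, not the encoder): PBLENDVB takes each destination byte from the second source whenever the corresponding mask byte has its high bit set, so a boolean vector whose lanes are all ones or all zeroes selects whole lanes correctly at any lane width.

fn pblendvb(a: [u8; 16], b: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    let mut out = a;
    for i in 0..16 {
        // The high bit of each mask byte picks the byte from `b`.
        if mask[i] & 0x80 != 0 {
            out[i] = b[i];
        }
    }
    out
}

fn main() {
    let (a, b) = ([1u8; 16], [2u8; 16]);
    let mut mask = [0u8; 16];
    for m in mask.iter_mut().take(8) {
        *m = 0xff; // low lanes all ones, high lanes all zeroes
    }
    let out = pblendvb(a, b, mask);
    assert_eq!((out[0], out[15]), (2, 1));
}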
// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
@ -1977,6 +2013,35 @@ fn define_simd(
}
}
// SIMD load extend (complex addressing)
let is_load_complex_length_two =
InstructionPredicate::new_length_equals(&*formats.load_complex, 2);
for (inst, opcodes) in &[
(uload8x8_complex, &PMOVZXBW),
(uload16x4_complex, &PMOVZXWD),
(uload32x2_complex, &PMOVZXDQ),
(sload8x8_complex, &PMOVSXBW),
(sload16x4_complex, &PMOVSXWD),
(sload32x2_complex, &PMOVSXDQ),
] {
for recipe in &[
rec_fldWithIndex,
rec_fldWithIndexDisp8,
rec_fldWithIndexDisp32,
] {
let template = recipe.opcodes(*opcodes);
let predicate = |encoding: EncodingBuilder| {
encoding
.isa_predicate(use_sse41_simd)
.inst_predicate(is_load_complex_length_two.clone())
};
e.enc32_func(inst.clone(), template.clone(), predicate);
// No infer_rex calculator for these recipes; place REX version first as in enc_x86_64.
e.enc64_func(inst.clone(), template.rex(), predicate);
e.enc64_func(inst.clone(), template, predicate);
}
}
// SIMD integer addition
for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] {
let iadd = iadd.bind(vector(*ty, sse_vector_size));
@ -2036,12 +2101,14 @@ fn define_simd(
e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap);
}
// SIMD multiplication with lane expansion.
e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ));
// SIMD integer multiplication for I64x2 using AVX512.
{
let imul = imul.bind(vector(I64, sse_vector_size));
e.enc_32_64_maybe_isap(
imul,
rec_evex_reg_vvvv_rm_128.opcodes(&PMULLQ).w(),
x86_pmullq,
rec_evex_reg_vvvv_rm_128.opcodes(&VPMULLQ).w(),
Some(use_avx512dq_simd), // TODO need an OR predicate to join with AVX512VL
);
}
@ -2117,8 +2184,11 @@ fn define_simd(
let ushr_imm = ushr_imm.bind(vector(*ty, sse_vector_size));
e.enc_both_inferred(ushr_imm, rec_f_ib.opcodes(*opcodes).rrr(2));
let sshr_imm = sshr_imm.bind(vector(*ty, sse_vector_size));
e.enc_both_inferred(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4));
// One exception: PSRAQ does not exist for 64x2 in SSE2; it requires a higher CPU feature set.
if *ty != I64 {
let sshr_imm = sshr_imm.bind(vector(*ty, sse_vector_size));
e.enc_both_inferred(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4));
}
}
// SIMD integer comparisons
@ -2223,8 +2293,7 @@ fn define_entity_ref(
let rec_gvaddr8 = r.template("gvaddr8");
let rec_pcrel_fnaddr8 = r.template("pcrel_fnaddr8");
let rec_pcrel_gvaddr8 = r.template("pcrel_gvaddr8");
let rec_spaddr4_id = r.template("spaddr4_id");
let rec_spaddr8_id = r.template("spaddr8_id");
let rec_spaddr_id = r.template("spaddr_id");
// Predicates shorthands.
let all_ones_funcaddrs_and_not_is_pic =
@ -2312,8 +2381,8 @@ fn define_entity_ref(
//
// TODO: Add encoding rules for stack_load and stack_store, so that they
// don't get legalized to stack_addr + load/store.
e.enc32(stack_addr.bind(I32), rec_spaddr4_id.opcodes(&LEA));
e.enc64(stack_addr.bind(I64), rec_spaddr8_id.opcodes(&LEA).rex().w());
e.enc64(stack_addr.bind(I64), rec_spaddr_id.opcodes(&LEA).rex().w());
e.enc32(stack_addr.bind(I32), rec_spaddr_id.opcodes(&LEA));
// Constant addresses (PIC).
e.enc64(const_addr.bind(I64), rec_const_addr.opcodes(&LEA).rex().w());

View file

@ -283,7 +283,7 @@ pub(crate) fn define(
Packed Shuffle Doublewords -- copies data from either memory or lanes in an extended
register and re-orders the data according to the passed immediate byte.
"#,
&formats.extract_lane,
&formats.binary_imm8,
)
.operands_in(vec![a, i]) // TODO allow copying from memory here (need more permissive type than TxN)
.operands_out(vec![a]),
@ -314,7 +314,7 @@ pub(crate) fn define(
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
"#,
&formats.extract_lane,
&formats.binary_imm8,
)
.operands_in(vec![x, Idx])
.operands_out(vec![a]),
@ -342,9 +342,9 @@ pub(crate) fn define(
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
"#,
&formats.insert_lane,
&formats.ternary_imm8,
)
.operands_in(vec![x, Idx, y])
.operands_in(vec![x, y, Idx])
.operands_out(vec![a]),
);
@ -369,9 +369,9 @@ pub(crate) fn define(
extracted from and which it is inserted to. This is similar to x86_pinsr but inserts
floats, which are already stored in an XMM register.
"#,
&formats.insert_lane,
&formats.ternary_imm8,
)
.operands_in(vec![x, Idx, y])
.operands_in(vec![x, y, Idx])
.operands_out(vec![a]),
);
@ -475,10 +475,11 @@ pub(crate) fn define(
.includes_scalars(false)
.build(),
);
let I64x2 = &TypeVar::new(
"I64x2",
"A SIMD vector type containing one large integer (the upper lane is concatenated with \
the lower lane to form the integer)",
let I128 = &TypeVar::new(
"I128",
"A SIMD vector type containing one large integer (due to Cranelift type constraints, \
this uses the Cranelift I64X2 type but should be understood as one large value, i.e., the \
upper lane is concatenated with the lower lane to form the integer)",
TypeSetBuilder::new()
.ints(64..64)
.simd_lanes(2..2)
@ -487,7 +488,7 @@ pub(crate) fn define(
);
let x = &Operand::new("x", IxN).with_doc("Vector value to shift");
let y = &Operand::new("y", I64x2).with_doc("Number of bits to shift");
let y = &Operand::new("y", I128).with_doc("Number of bits to shift");
let a = &Operand::new("a", IxN);
ig.push(
@ -532,6 +533,47 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
let I64x2 = &TypeVar::new(
"I64x2",
"A SIMD vector type containing two 64-bit integers",
TypeSetBuilder::new()
.ints(64..64)
.simd_lanes(2..2)
.includes_scalars(false)
.build(),
);
let x = &Operand::new("x", I64x2);
let y = &Operand::new("y", I64x2);
let a = &Operand::new("a", I64x2);
ig.push(
Inst::new(
"x86_pmullq",
r#"
Multiply Packed Integers -- Multiply two 64x2 integers and receive a 64x2 result with
lane-wise wrapping if the result overflows. This instruction is necessary to add distinct
encodings for CPUs with newer vector features.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_pmuludq",
r#"
Multiply Packed Integers -- Using only the bottom 32 bits in each lane, multiply two 64x2
unsigned integers and receive a 64x2 result. This instruction avoids the need for handling
overflow as in `x86_pmullq`.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
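A standalone sketch of the `x86_pmuludq` lane semantics documented above (illustrative only): because each operand lane contributes only its low doubleword, the 32x32-bit product always fits in the 64-bit result lane, so no overflow handling is needed.

fn pmuludq_lane(x: u64, y: u64) -> u64 {
    // Only the bottom 32 bits of each lane participate in the multiply.
    (x & 0xffff_ffff) * (y & 0xffff_ffff)
}

fn main() {
    // Even the largest possible product fits in 64 bits.
    assert_eq!(pmuludq_lane(u64::MAX, u64::MAX), 0xffff_fffe_0000_0001);
}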
let x = &Operand::new("x", TxN);
let y = &Operand::new("y", TxN);
let f = &Operand::new("f", iflags);

View file

@ -8,7 +8,7 @@ use crate::shared::Definitions as SharedDefinitions;
#[allow(clippy::many_single_char_names)]
pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
let mut group = TransformGroupBuilder::new(
let mut expand = TransformGroupBuilder::new(
"x86_expand",
r#"
Legalize instructions by expansion.
@ -18,6 +18,37 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
.isa("x86")
.chain_with(shared.transform_groups.by_name("expand_flags").id);
let mut narrow = TransformGroupBuilder::new(
"x86_narrow",
r#"
Legalize instructions by narrowing.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("narrow_flags").id);
let mut narrow_avx = TransformGroupBuilder::new(
"x86_narrow_avx",
r#"
Legalize instructions by narrowing with CPU feature checks.
This special case converts using x86 AVX instructions where available."#,
)
.isa("x86");
// We cannot chain with the x86_narrow group until this group is built; see the bottom of this
// function for where this is chained.
let mut widen = TransformGroupBuilder::new(
"x86_widen",
r#"
Legalize instructions by widening.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("widen").id);
// List of instructions.
let insts = &shared.instructions;
let band = insts.by_name("band");
@ -37,6 +68,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let imul = insts.by_name("imul");
let ineg = insts.by_name("ineg");
let isub = insts.by_name("isub");
let ishl = insts.by_name("ishl");
let ireduce = insts.by_name("ireduce");
let popcnt = insts.by_name("popcnt");
let sdiv = insts.by_name("sdiv");
let selectif = insts.by_name("selectif");
@ -45,6 +78,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let tls_value = insts.by_name("tls_value");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
let ushr = insts.by_name("ushr");
let ushr_imm = insts.by_name("ushr_imm");
let urem = insts.by_name("urem");
@ -55,14 +89,40 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let imm = &shared.imm;
// Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
// the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
// not encodable.
let a = var("a");
let x = var("x");
let y = var("y");
let z = var("z");
for &ty in &[I8, I16, I32] {
let ishl_by_i64 = ishl.bind(ty).bind(I64);
let ireduce = ireduce.bind(I32);
expand.legalize(
def!(a = ishl_by_i64(x, y)),
vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
);
}
for &ty in &[I8, I16, I32] {
let ushr_by_i64 = ushr.bind(ty).bind(I64);
let ireduce = ireduce.bind(I32);
expand.legalize(
def!(a = ushr_by_i64(x, y)),
vec![def!(z = ireduce(y)), def!(a = ushr(x, z))],
);
}
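A quick standalone check of the reasoning in the comment above (hypothetical helper names, not the generated legalization): for a value of 32 bits or narrower, only the shift amount modulo the operand width matters, so an I64-typed amount can be narrowed with `ireduce` before shifting.

fn main() {
    let x: u32 = 0x1234_5678;
    let amt: u64 = 35; // an I64-typed shift amount
    let reduced = amt as u32; // the effect of ireduce.I32
    // wrapping_shl masks the count by 31, mirroring the mod-32 behaviour.
    assert_eq!(x.wrapping_shl(reduced), x.wrapping_shl((amt % 32) as u32));
}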
// Division and remainder.
//
// The srem expansion requires custom code because srem INT_MIN, -1 is not
// allowed to trap. The other ops need to check avoid_div_traps.
group.custom_legalize(sdiv, "expand_sdivrem");
group.custom_legalize(srem, "expand_sdivrem");
group.custom_legalize(udiv, "expand_udivrem");
group.custom_legalize(urem, "expand_udivrem");
expand.custom_legalize(sdiv, "expand_sdivrem");
expand.custom_legalize(srem, "expand_sdivrem");
expand.custom_legalize(udiv, "expand_udivrem");
expand.custom_legalize(urem, "expand_udivrem");
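A standalone sketch of why `srem` needs the custom `expand_sdivrem` path (illustrative, not the actual expansion): hardware IDIV faults on INT_MIN / -1, but the remainder in that case is defined to be 0, so the expansion must special-case a -1 divisor instead of trapping.

fn srem_no_trap(n: i32, d: i32) -> i32 {
    if d == -1 {
        0 // i32::MIN % -1 must yield 0 rather than fault like IDIV would
    } else {
        n % d
    }
}

fn main() {
    assert_eq!(srem_no_trap(i32::MIN, -1), 0);
    assert_eq!(srem_no_trap(7, 3), 1);
}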
// Double length (widening) multiplication.
let a = var("a");
@ -73,12 +133,12 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let res_lo = var("res_lo");
let res_hi = var("res_hi");
group.legalize(
expand.legalize(
def!(res_hi = umulhi(x, y)),
vec![def!((res_lo, res_hi) = x86_umulx(x, y))],
);
group.legalize(
expand.legalize(
def!(res_hi = smulhi(x, y)),
vec![def!((res_lo, res_hi) = x86_smulx(x, y))],
);
@ -97,7 +157,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let floatcc_one = Literal::enumerator_for(&imm.floatcc, "one");
// Equality needs an explicit `ord` test which checks the parity bit.
group.legalize(
expand.legalize(
def!(a = fcmp(floatcc_eq, x, y)),
vec![
def!(a1 = fcmp(floatcc_ord, x, y)),
@ -105,7 +165,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
def!(a = band(a1, a2)),
],
);
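As a rough standalone model of the expansion above (plain Rust floats standing in for the XMM compare; the middle compare is elided by the hunk boundary): x86's UCOMIS* reports "equal" for unordered inputs too, so ordered equality is computed as the AND of an explicit ordered test and the raw equality result.

fn fcmp_eq_expanded(x: f64, y: f64) -> bool {
    // UCOMIS* sets ZF for both "equal" and "unordered" results...
    let raw_eq = x == y || x.is_nan() || y.is_nan();
    // ...so the explicit `ord` test (the parity-flag check) is ANDed in.
    let ord = !x.is_nan() && !y.is_nan();
    ord && raw_eq
}

fn main() {
    assert!(fcmp_eq_expanded(1.0, 1.0));
    assert!(!fcmp_eq_expanded(f64::NAN, 1.0));
    assert!(!fcmp_eq_expanded(f64::NAN, f64::NAN));
}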
group.legalize(
expand.legalize(
def!(a = fcmp(floatcc_ne, x, y)),
vec![
def!(a1 = fcmp(floatcc_uno, x, y)),
@ -130,20 +190,20 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
(floatcc_ugt, floatcc_ult),
(floatcc_uge, floatcc_ule),
] {
group.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]);
expand.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]);
}
// We need to modify the CFG for min/max legalization.
group.custom_legalize(fmin, "expand_minmax");
group.custom_legalize(fmax, "expand_minmax");
expand.custom_legalize(fmin, "expand_minmax");
expand.custom_legalize(fmax, "expand_minmax");
// Conversions from unsigned need special handling.
group.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint");
expand.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint");
// Conversions from float to int can trap and modify the control flow graph.
group.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint");
group.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint");
group.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat");
group.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat");
expand.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint");
expand.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint");
expand.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat");
expand.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat");
// Count leading and trailing zeroes, for baseline x86_64
let c_minus_one = var("c_minus_one");
@ -158,7 +218,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let intcc_eq = Literal::enumerator_for(&imm.intcc, "eq");
let imm64_minus_one = Literal::constant(&imm.imm64, -1);
let imm64_63 = Literal::constant(&imm.imm64, 63);
group.legalize(
expand.legalize(
def!(a = clz.I64(x)),
vec![
def!(c_minus_one = iconst(imm64_minus_one)),
@ -170,7 +230,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
);
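A standalone sketch of the clz-via-BSR expansion above (illustrative; the -1 fallback models the `selectif` on the zero flag, since BSR leaves its result undefined for a zero input):

fn clz64_expanded(x: u64) -> u64 {
    // BSR yields the index of the highest set bit, or -1 here for zero.
    let index: i64 = if x == 0 {
        -1
    } else {
        63 - x.leading_zeros() as i64
    };
    (63 - index) as u64
}

fn main() {
    assert_eq!(clz64_expanded(0), 64);
    assert_eq!(clz64_expanded(1), 63);
    assert_eq!(clz64_expanded(u64::MAX), 0);
}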
let imm64_31 = Literal::constant(&imm.imm64, 31);
group.legalize(
expand.legalize(
def!(a = clz.I32(x)),
vec![
def!(c_minus_one = iconst(imm64_minus_one)),
@ -182,7 +242,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
);
let imm64_64 = Literal::constant(&imm.imm64, 64);
group.legalize(
expand.legalize(
def!(a = ctz.I64(x)),
vec![
def!(c_sixty_four = iconst(imm64_64)),
@ -192,7 +252,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
);
let imm64_32 = Literal::constant(&imm.imm64, 32);
group.legalize(
expand.legalize(
def!(a = ctz.I32(x)),
vec![
def!(c_thirty_two = iconst(imm64_32)),
@ -225,7 +285,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let imm64_1 = Literal::constant(&imm.imm64, 1);
let imm64_4 = Literal::constant(&imm.imm64, 4);
group.legalize(
expand.legalize(
def!(r = popcnt.I64(x)),
vec![
def!(qv3 = ushr_imm(x, imm64_1)),
@ -266,7 +326,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let lc0F = var("lc0F");
let lc01 = var("lc01");
group.legalize(
expand.legalize(
def!(r = popcnt.I32(x)),
vec![
def!(lv3 = ushr_imm(x, imm64_1)),
@ -289,31 +349,27 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
],
);
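The popcnt expansions are only partially visible in these hunks; assuming they follow the standard SWAR bit count that the opening `ushr_imm(x, 1)` step and the 1/4 immediates suggest, a standalone sketch looks like this:

fn popcnt64_swar(x: u64) -> u64 {
    // Classic pairwise bit count: 2-bit, 4-bit, then byte sums, folded
    // together with a multiply and a final shift by 56.
    let v = x - ((x >> 1) & 0x5555_5555_5555_5555);
    let v = (v & 0x3333_3333_3333_3333) + ((v >> 2) & 0x3333_3333_3333_3333);
    let v = (v + (v >> 4)) & 0x0f0f_0f0f_0f0f_0f0f;
    v.wrapping_mul(0x0101_0101_0101_0101) >> 56
}

fn main() {
    assert_eq!(popcnt64_swar(0), 0);
    assert_eq!(popcnt64_swar(0b1011), 3);
    assert_eq!(popcnt64_swar(u64::MAX), 64);
}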
group.custom_legalize(ineg, "convert_ineg");
group.custom_legalize(tls_value, "expand_tls_value");
group.build_and_add_to(&mut shared.transform_groups);
let mut widen = TransformGroupBuilder::new(
"x86_widen",
r#"
Legalize instructions by widening.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("widen").id);
expand.custom_legalize(ineg, "convert_ineg");
expand.custom_legalize(tls_value, "expand_tls_value");
widen.custom_legalize(ineg, "convert_ineg");
widen.build_and_add_to(&mut shared.transform_groups);
// To reduce compilation times, separate out large blocks of legalizations by
// theme.
define_simd(shared, x86_instructions);
// To reduce compilation times, separate out large blocks of legalizations by theme.
define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx);
expand.build_and_add_to(&mut shared.transform_groups);
let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups);
narrow_avx
.chain_with(narrow_id)
.build_and_add_to(&mut shared.transform_groups);
widen.build_and_add_to(&mut shared.transform_groups);
}
fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
fn define_simd(
shared: &mut SharedDefinitions,
x86_instructions: &InstructionGroup,
narrow: &mut TransformGroupBuilder,
narrow_avx: &mut TransformGroupBuilder,
) {
let insts = &shared.instructions;
let band = insts.by_name("band");
let band_not = insts.by_name("band_not");
@ -330,6 +386,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let icmp = insts.by_name("icmp");
let imax = insts.by_name("imax");
let imin = insts.by_name("imin");
let imul = insts.by_name("imul");
let ineg = insts.by_name("ineg");
let insertlane = insts.by_name("insertlane");
let ishl = insts.by_name("ishl");
@ -349,6 +406,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let vconst = insts.by_name("vconst");
let vall_true = insts.by_name("vall_true");
let vany_true = insts.by_name("vany_true");
let vselect = insts.by_name("vselect");
let x86_packss = x86_instructions.by_name("x86_packss");
let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
@ -364,16 +422,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let imm = &shared.imm;
let mut narrow = TransformGroupBuilder::new(
"x86_narrow",
r#"
Legalize instructions by narrowing.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("narrow_flags").id);
// Set up variables and immediates.
let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
@ -430,7 +478,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
// Move into the lowest 16 bits of an XMM register.
def!(a = scalar_to_vector(x)),
// Insert the value again but in the next lowest 16 bits.
def!(b = insertlane(a, uimm8_one, x)),
def!(b = insertlane(a, x, uimm8_one)),
// No instruction emitted; pretend this is an I32x4 so we can use PSHUFD.
def!(c = raw_bitcast_any16x8_to_i32x4(b)),
// Broadcast the bytes in the XMM register with PSHUFD.
@ -464,7 +512,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
// Move into the lowest 64 bits of an XMM register.
def!(a = scalar_to_vector(x)),
// Move into the highest 64 bits of the same XMM register.
def!(y = insertlane(a, uimm8_one, x)),
def!(y = insertlane(a, x, uimm8_one)),
],
);
}
@ -493,8 +541,8 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
);
}
// SIMD shift right (arithmetic)
for ty in &[I16, I32, I64] {
// SIMD shift right (arithmetic, i16x8 and i32x4)
for ty in &[I16, I32] {
let sshr = sshr.bind(vector(*ty, sse_vector_size));
let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize(
@ -502,6 +550,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
);
}
// SIMD shift right (arithmetic, i8x16)
{
let sshr = sshr.bind(vector(I8, sse_vector_size));
let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
@ -526,6 +575,25 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
],
);
}
// SIMD shift right (arithmetic, i64x2)
{
let sshr_vector = sshr.bind(vector(I64, sse_vector_size));
let sshr_scalar_lane0 = sshr.bind(I64);
let sshr_scalar_lane1 = sshr.bind(I64);
narrow.legalize(
def!(z = sshr_vector(x, y)),
vec![
// Use scalar operations to shift the first lane.
def!(a = extractlane(x, uimm8_zero)),
def!(b = sshr_scalar_lane0(a, y)),
def!(c = insertlane(x, b, uimm8_zero)),
// Do the same for the second lane.
def!(d = extractlane(x, uimm8_one)),
def!(e = sshr_scalar_lane1(d, y)),
def!(z = insertlane(c, e, uimm8_one)),
],
);
}
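A standalone sketch of the lane-by-lane fallback above (illustrative): without a packed 64-bit arithmetic right shift below AVX-512, each lane is extracted, shifted with the scalar instruction, and re-inserted.

fn sshr_i64x2_via_lanes(x: [i64; 2], amt: u32) -> [i64; 2] {
    let shift = amt % 64; // scalar shifts mask the amount to the type width
    [x[0] >> shift, x[1] >> shift]
}

fn main() {
    assert_eq!(sshr_i64x2_via_lanes([-8, 8], 1), [-4, 4]);
    assert_eq!(sshr_i64x2_via_lanes([-1, i64::MAX], 63), [-1, 0]);
}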
// SIMD select
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
@ -540,6 +608,17 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
);
}
// SIMD vselect; replace with bitselect if BLEND* instructions are not available.
// This works because each lane of the boolean vector is filled with zeroes or ones.
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let vselect = vselect.bind(vector(ty, sse_vector_size));
let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(d = vselect(c, x, y)),
vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
);
}
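A standalone sketch of the bitselect fallback above (illustrative lane model): because every boolean lane is all ones or all zeroes, a plain bitwise select picks whole lanes, matching what the BLEND* encodings would have done.

fn bitselect_lane(mask: u64, if_true: u64, if_false: u64) -> u64 {
    (mask & if_true) | (!mask & if_false)
}

fn main() {
    assert_eq!(bitselect_lane(u64::MAX, 7, 9), 7); // all-ones lane selects the first value
    assert_eq!(bitselect_lane(0, 7, 9), 9); // all-zeroes lane selects the second
}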
// SIMD vany_true
let ne = Literal::enumerator_for(&imm.intcc, "ne");
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
@ -709,5 +788,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
narrow.custom_legalize(ushr, "convert_ushr");
narrow.custom_legalize(ishl, "convert_ishl");
narrow.build_and_add_to(&mut shared.transform_groups);
// This lives in the expand group to avoid conflicting with, e.g., i128 legalizations.
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
}

View file

@ -1,6 +1,6 @@
use crate::cdsl::cpu_modes::CpuMode;
use crate::cdsl::isa::TargetIsa;
use crate::cdsl::types::ReferenceType;
use crate::cdsl::types::{ReferenceType, VectorType};
use crate::shared::types::Bool::B1;
use crate::shared::types::Float::{F32, F64};
@ -35,6 +35,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
let x86_widen = shared_defs.transform_groups.by_name("x86_widen");
let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
let x86_narrow_avx = shared_defs.transform_groups.by_name("x86_narrow_avx");
let x86_expand = shared_defs.transform_groups.by_name("x86_expand");
x86_32.legalize_monomorphic(expand_flags);
@ -46,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
x86_32.legalize_type(F32, x86_expand);
x86_32.legalize_type(F64, x86_expand);
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_64.legalize_monomorphic(expand_flags);
x86_64.legalize_default(x86_narrow);
@ -57,6 +59,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
x86_64.legalize_type(F32, x86_expand);
x86_64.legalize_type(F64, x86_expand);
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
let recipes = recipes::define(shared_defs, &settings, &regs);

View file

@ -54,6 +54,14 @@ pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc];
/// Bit scan reverse (stores index of first encountered 1 from the back).
pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd];
/// Select packed single-precision floating-point values from xmm1 and xmm2/m128
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14];
/// Select packed double-precision floating-point values from xmm1 and xmm2/m128
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15];
/// Call near, relative, displacement relative to next instruction (sign-extended).
pub static CALL_RELATIVE: [u8; 1] = [0xe8];
@ -335,6 +343,10 @@ pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xE0];
/// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2).
pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
/// Select byte values from xmm1 and xmm2/m128 from mask specified in the high bit of each byte
/// in XMM0 and store the values into xmm1 (SSE4.1).
pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];
/// Compare packed data for equal (SSE2).
pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];
@ -459,7 +471,11 @@ pub static PMULLD: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
/// Multiply the packed quadword signed integers in xmm2 and xmm3/m128 and store the low 64
/// bits of each product in xmm1 (AVX512VL/DQ). Requires an EVEX encoding.
pub static PMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
pub static VPMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
/// Multiply packed unsigned doubleword integers in xmm1 by packed unsigned doubleword integers
/// in xmm2/m128, and store the quadword results in xmm1 (SSE2).
pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4];
/// Pop top of stack into r{16,32,64}; increment stack pointer.
pub static POP_REG: [u8; 1] = [0x58];

View file

@ -427,6 +427,7 @@ pub(crate) fn define<'shared>(
let reg_rcx = Register::new(gpr, regs.regunit_by_name(gpr, "rcx"));
let reg_rdx = Register::new(gpr, regs.regunit_by_name(gpr, "rdx"));
let reg_r15 = Register::new(gpr, regs.regunit_by_name(gpr, "r15"));
let reg_xmm0 = Register::new(fpr, regs.regunit_by_name(fpr, "xmm0"));
// Stack operand with a 32-bit signed displacement from either RBP or RSP.
let stack_gpr32 = Stack::new(gpr);
@ -607,12 +608,12 @@ pub(crate) fn define<'shared>(
// XX /r with FPR ins and outs. A form with a byte immediate.
{
recipes.add_template_inferred(
EncodingRecipeBuilder::new("fa_ib", &formats.insert_lane, 2)
EncodingRecipeBuilder::new("fa_ib", &formats.ternary_imm8, 2)
.operands_in(vec![fpr, fpr])
.operands_out(vec![0])
.inst_predicate(InstructionPredicate::new_is_unsigned_int(
&*formats.insert_lane,
"lane",
&*formats.ternary_imm8,
"imm",
8,
0,
))
@ -620,7 +621,7 @@ pub(crate) fn define<'shared>(
r#"
{{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
modrm_rr(in_reg1, in_reg0, sink);
let imm:i64 = lane.into();
let imm: i64 = imm.into();
sink.put1(imm as u8);
"#,
),
@ -904,14 +905,32 @@ pub(crate) fn define<'shared>(
.inferred_rex_compute_size("size_with_inferred_rex_for_inreg1"),
);
// XX /r for BLEND* instructions
recipes.add_template_inferred(
EncodingRecipeBuilder::new("blend", &formats.ternary, 1)
.operands_in(vec![
OperandConstraint::FixedReg(reg_xmm0),
OperandConstraint::RegClass(fpr),
OperandConstraint::RegClass(fpr),
])
.operands_out(vec![2])
.emit(
r#"
{{PUT_OP}}(bits, rex2(in_reg1, in_reg2), sink);
modrm_rr(in_reg1, in_reg2, sink);
"#,
),
"size_with_inferred_rex_for_inreg1_inreg2",
);
// XX /n ib with 8-bit immediate sign-extended.
{
recipes.add_template_inferred(
EncodingRecipeBuilder::new("r_ib", &formats.binary_imm, 2)
EncodingRecipeBuilder::new("r_ib", &formats.binary_imm64, 2)
.operands_in(vec![gpr])
.operands_out(vec![0])
.inst_predicate(InstructionPredicate::new_is_signed_int(
&*formats.binary_imm,
&*formats.binary_imm64,
"imm",
8,
0,
@ -928,11 +947,11 @@ pub(crate) fn define<'shared>(
);
recipes.add_template_inferred(
EncodingRecipeBuilder::new("f_ib", &formats.binary_imm, 2)
EncodingRecipeBuilder::new("f_ib", &formats.binary_imm64, 2)
.operands_in(vec![fpr])
.operands_out(vec![0])
.inst_predicate(InstructionPredicate::new_is_signed_int(
&*formats.binary_imm,
&*formats.binary_imm64,
"imm",
8,
0,
@ -951,11 +970,11 @@ pub(crate) fn define<'shared>(
// XX /n id with 32-bit immediate sign-extended.
recipes.add_template(
Template::new(
EncodingRecipeBuilder::new("r_id", &formats.binary_imm, 5)
EncodingRecipeBuilder::new("r_id", &formats.binary_imm64, 5)
.operands_in(vec![gpr])
.operands_out(vec![0])
.inst_predicate(InstructionPredicate::new_is_signed_int(
&*formats.binary_imm,
&*formats.binary_imm64,
"imm",
32,
0,
@ -977,20 +996,20 @@ pub(crate) fn define<'shared>(
// XX /r ib with 8-bit unsigned immediate (e.g. for pshufd)
{
recipes.add_template_inferred(
EncodingRecipeBuilder::new("r_ib_unsigned_fpr", &formats.extract_lane, 2)
EncodingRecipeBuilder::new("r_ib_unsigned_fpr", &formats.binary_imm8, 2)
.operands_in(vec![fpr])
.operands_out(vec![fpr])
.inst_predicate(InstructionPredicate::new_is_unsigned_int(
&*formats.extract_lane,
"lane",
&*formats.binary_imm8,
"imm",
8,
0,
)) // TODO if the format name is changed then "lane" should be renamed to something more appropriate--ordering mask? broadcast immediate?
))
.emit(
r#"
{{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
modrm_rr(in_reg0, out_reg0, sink);
let imm:i64 = lane.into();
let imm: i64 = imm.into();
sink.put1(imm as u8);
"#,
),
@ -1001,17 +1020,17 @@ pub(crate) fn define<'shared>(
// XX /r ib with 8-bit unsigned immediate (e.g. for extractlane)
{
recipes.add_template_inferred(
EncodingRecipeBuilder::new("r_ib_unsigned_gpr", &formats.extract_lane, 2)
EncodingRecipeBuilder::new("r_ib_unsigned_gpr", &formats.binary_imm8, 2)
.operands_in(vec![fpr])
.operands_out(vec![gpr])
.inst_predicate(InstructionPredicate::new_is_unsigned_int(
&*formats.extract_lane, "lane", 8, 0,
&*formats.binary_imm8, "imm", 8, 0,
))
.emit(
r#"
{{PUT_OP}}(bits, rex2(out_reg0, in_reg0), sink);
modrm_rr(out_reg0, in_reg0, sink); // note the flipped register in the ModR/M byte
let imm:i64 = lane.into();
let imm: i64 = imm.into();
sink.put1(imm as u8);
"#,
), "size_with_inferred_rex_for_inreg0_outreg0"
@ -1021,12 +1040,12 @@ pub(crate) fn define<'shared>(
// XX /r ib with 8-bit unsigned immediate (e.g. for insertlane)
{
recipes.add_template_inferred(
EncodingRecipeBuilder::new("r_ib_unsigned_r", &formats.insert_lane, 2)
EncodingRecipeBuilder::new("r_ib_unsigned_r", &formats.ternary_imm8, 2)
.operands_in(vec![fpr, gpr])
.operands_out(vec![0])
.inst_predicate(InstructionPredicate::new_is_unsigned_int(
&*formats.insert_lane,
"lane",
&*formats.ternary_imm8,
"imm",
8,
0,
))
@ -1034,7 +1053,7 @@ pub(crate) fn define<'shared>(
r#"
{{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
modrm_rr(in_reg1, in_reg0, sink);
let imm:i64 = lane.into();
let imm: i64 = imm.into();
sink.put1(imm as u8);
"#,
),
@ -1432,23 +1451,7 @@ pub(crate) fn define<'shared>(
// TODO Alternative forms for 8-bit immediates, when applicable.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("spaddr4_id", &formats.stack_load, 6)
.operands_out(vec![gpr])
.emit(
r#"
let sp = StackRef::sp(stack_slot, &func.stack_slots);
let base = stk_base(sp.base);
{{PUT_OP}}(bits, rex2(out_reg0, base), sink);
modrm_sib_disp8(out_reg0, sink);
sib_noindex(base, sink);
let imm : i32 = offset.into();
sink.put4(sp.offset.checked_add(imm).unwrap() as u32);
"#,
),
);
recipes.add_template_recipe(
EncodingRecipeBuilder::new("spaddr8_id", &formats.stack_load, 6)
EncodingRecipeBuilder::new("spaddr_id", &formats.stack_load, 6)
.operands_out(vec![gpr])
.emit(
r#"
@ -2871,12 +2874,12 @@ pub(crate) fn define<'shared>(
{
let has_small_offset =
InstructionPredicate::new_is_signed_int(&*formats.binary_imm, "imm", 8, 0);
InstructionPredicate::new_is_signed_int(&*formats.binary_imm64, "imm", 8, 0);
// XX /n, MI form with imm8.
recipes.add_template(
Template::new(
EncodingRecipeBuilder::new("rcmp_ib", &formats.binary_imm, 2)
EncodingRecipeBuilder::new("rcmp_ib", &formats.binary_imm64, 2)
.operands_in(vec![gpr])
.operands_out(vec![reg_rflags])
.inst_predicate(has_small_offset)
@ -2894,12 +2897,12 @@ pub(crate) fn define<'shared>(
);
let has_big_offset =
InstructionPredicate::new_is_signed_int(&*formats.binary_imm, "imm", 32, 0);
InstructionPredicate::new_is_signed_int(&*formats.binary_imm64, "imm", 32, 0);
// XX /n, MI form with imm32.
recipes.add_template(
Template::new(
EncodingRecipeBuilder::new("rcmp_id", &formats.binary_imm, 5)
EncodingRecipeBuilder::new("rcmp_id", &formats.binary_imm64, 5)
.operands_in(vec![gpr])
.operands_out(vec![reg_rflags])
.inst_predicate(has_big_offset)

View file

@ -3,6 +3,12 @@ use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder};
pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
let mut settings = SettingGroupBuilder::new("x86");
settings.add_bool(
"use_new_backend",
"Whether to use the new codegen backend using the new isel",
false,
);
// CPUID.01H:ECX
let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);

View file

@ -4,7 +4,7 @@ use std::rc::Rc;
pub(crate) struct Formats {
pub(crate) binary: Rc<InstructionFormat>,
pub(crate) binary_imm: Rc<InstructionFormat>,
pub(crate) binary_imm64: Rc<InstructionFormat>,
pub(crate) branch: Rc<InstructionFormat>,
pub(crate) branch_float: Rc<InstructionFormat>,
pub(crate) branch_icmp: Rc<InstructionFormat>,
@ -17,14 +17,13 @@ pub(crate) struct Formats {
pub(crate) cond_trap: Rc<InstructionFormat>,
pub(crate) copy_special: Rc<InstructionFormat>,
pub(crate) copy_to_ssa: Rc<InstructionFormat>,
pub(crate) extract_lane: Rc<InstructionFormat>,
pub(crate) binary_imm8: Rc<InstructionFormat>,
pub(crate) float_compare: Rc<InstructionFormat>,
pub(crate) float_cond: Rc<InstructionFormat>,
pub(crate) float_cond_trap: Rc<InstructionFormat>,
pub(crate) func_addr: Rc<InstructionFormat>,
pub(crate) heap_addr: Rc<InstructionFormat>,
pub(crate) indirect_jump: Rc<InstructionFormat>,
pub(crate) insert_lane: Rc<InstructionFormat>,
pub(crate) int_compare: Rc<InstructionFormat>,
pub(crate) int_compare_imm: Rc<InstructionFormat>,
pub(crate) int_cond: Rc<InstructionFormat>,
@ -45,6 +44,7 @@ pub(crate) struct Formats {
pub(crate) store_complex: Rc<InstructionFormat>,
pub(crate) table_addr: Rc<InstructionFormat>,
pub(crate) ternary: Rc<InstructionFormat>,
pub(crate) ternary_imm8: Rc<InstructionFormat>,
pub(crate) trap: Rc<InstructionFormat>,
pub(crate) unary: Rc<InstructionFormat>,
pub(crate) unary_bool: Rc<InstructionFormat>,
@ -76,7 +76,9 @@ impl Formats {
binary: Builder::new("Binary").value().value().build(),
binary_imm: Builder::new("BinaryImm").value().imm(&imm.imm64).build(),
binary_imm8: Builder::new("BinaryImm8").value().imm(&imm.uimm8).build(),
binary_imm64: Builder::new("BinaryImm64").value().imm(&imm.imm64).build(),
// The select instructions are controlled by the second VALUE operand.
// The first VALUE operand is the controlling flag which has a derived type.
@ -88,23 +90,18 @@ impl Formats {
.typevar_operand(1)
.build(),
ternary_imm8: Builder::new("TernaryImm8")
.value()
.imm(&imm.uimm8)
.value()
.build(),
// Catch-all for instructions with many outputs and inputs and no immediate
// operands.
multiary: Builder::new("MultiAry").varargs().build(),
nullary: Builder::new("NullAry").build(),
insert_lane: Builder::new("InsertLane")
.value()
.imm_with_name("lane", &imm.uimm8)
.value()
.build(),
extract_lane: Builder::new("ExtractLane")
.value()
.imm_with_name("lane", &imm.uimm8)
.build(),
shuffle: Builder::new("Shuffle")
.value()
.value()


@ -559,9 +559,9 @@ fn define_simd_lane_access(
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
"#,
&formats.insert_lane,
&formats.ternary_imm8,
)
.operands_in(vec![x, Idx, y])
.operands_in(vec![x, y, Idx])
.operands_out(vec![a]),
);
@ -579,7 +579,7 @@ fn define_simd_lane_access(
may or may not be zeroed depending on the ISA but the type system should prevent using
``a`` as anything other than the extracted value.
"#,
&formats.extract_lane,
&formats.binary_imm8,
)
.operands_in(vec![x, Idx])
.operands_out(vec![a]),
@ -1172,6 +1172,20 @@ pub(crate) fn define(
.can_load(true),
);
ig.push(
Inst::new(
"uload8x8_complex",
r#"
Load an 8x8 vector (64 bits) from memory at ``sum(args) + Offset`` and zero-extend into an
i16x8 vector.
"#,
&formats.load_complex,
)
.operands_in(vec![MemFlags, args, Offset])
.operands_out(vec![a])
.can_load(true),
);
ig.push(
Inst::new(
"sload8x8",
@ -1186,6 +1200,20 @@ pub(crate) fn define(
.can_load(true),
);
ig.push(
Inst::new(
"sload8x8_complex",
r#"
Load an 8x8 vector (64 bits) from memory at ``sum(args) + Offset`` and sign-extend into an
i16x8 vector.
"#,
&formats.load_complex,
)
.operands_in(vec![MemFlags, args, Offset])
.operands_out(vec![a])
.can_load(true),
);
let I32x4 = &TypeVar::new(
"I32x4",
"A SIMD vector with exactly 4 lanes of 32-bit values",
@ -1201,7 +1229,7 @@ pub(crate) fn define(
Inst::new(
"uload16x4",
r#"
Load an 16x4 vector (64 bits) from memory at ``p + Offset`` and zero-extend into an i32x4
Load a 16x4 vector (64 bits) from memory at ``p + Offset`` and zero-extend into an i32x4
vector.
"#,
&formats.load,
@ -1211,6 +1239,20 @@ pub(crate) fn define(
.can_load(true),
);
ig.push(
Inst::new(
"uload16x4_complex",
r#"
Load a 16x4 vector (64 bits) from memory at ``sum(args) + Offset`` and zero-extend into an
i32x4 vector.
"#,
&formats.load_complex,
)
.operands_in(vec![MemFlags, args, Offset])
.operands_out(vec![a])
.can_load(true),
);
ig.push(
Inst::new(
"sload16x4",
@ -1225,6 +1267,20 @@ pub(crate) fn define(
.can_load(true),
);
ig.push(
Inst::new(
"sload16x4_complex",
r#"
Load a 16x4 vector (64 bits) from memory at ``sum(args) + Offset`` and sign-extend into an
i32x4 vector.
"#,
&formats.load_complex,
)
.operands_in(vec![MemFlags, args, Offset])
.operands_out(vec![a])
.can_load(true),
);
let I64x2 = &TypeVar::new(
"I64x2",
"A SIMD vector with exactly 2 lanes of 64-bit values",
@ -1250,6 +1306,20 @@ pub(crate) fn define(
.can_load(true),
);
ig.push(
Inst::new(
"uload32x2_complex",
r#"
Load a 32x2 vector (64 bits) from memory at ``sum(args) + Offset`` and zero-extend into an
i64x2 vector.
"#,
&formats.load_complex,
)
.operands_in(vec![MemFlags, args, Offset])
.operands_out(vec![a])
.can_load(true),
);
ig.push(
Inst::new(
"sload32x2",
@ -1264,6 +1334,20 @@ pub(crate) fn define(
.can_load(true),
);
ig.push(
Inst::new(
"sload32x2_complex",
r#"
Load a 32x2 vector (64 bits) from memory at ``sum(args) + Offset`` and sign-extend into an
i64x2 vector.
"#,
&formats.load_complex,
)
.operands_in(vec![MemFlags, args, Offset])
.operands_out(vec![a])
.can_load(true),
);
let x = &Operand::new("x", Mem).with_doc("Value to be stored");
let a = &Operand::new("a", Mem).with_doc("Value loaded");
let Offset =
@ -2131,7 +2215,7 @@ pub(crate) fn define(
Like `icmp_imm`, but returns integer CPU flags instead of testing
a specific condition code.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![f]),
@ -2181,7 +2265,7 @@ pub(crate) fn define(
This is similar to `iadd` but the operands are interpreted as signed integers and their
summed result, instead of wrapping, will be saturated to the lowest or highest
signed integer for the controlling type (e.g. `0x80` or `0x7F` for i8). For example,
since an `iadd_ssat.i8` of `0x70` and `0x70` is greater than `0x7F`, the result will be
since an `sadd_sat.i8` of `0x70` and `0x70` is greater than `0x7F`, the result will be
clamped to `0x7F`.
"#,
&formats.binary,
@ -2376,7 +2460,7 @@ pub(crate) fn define(
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2391,7 +2475,7 @@ pub(crate) fn define(
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2405,7 +2489,7 @@ pub(crate) fn define(
This operation traps if the divisor is zero.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2421,7 +2505,7 @@ pub(crate) fn define(
representable in `B` bits two's complement. This only happens
when `x = -2^{B-1}, Y = -1`.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2435,7 +2519,7 @@ pub(crate) fn define(
This operation traps if the divisor is zero.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2449,7 +2533,7 @@ pub(crate) fn define(
This operation traps if the divisor is zero.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2468,7 +2552,7 @@ pub(crate) fn define(
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2868,7 +2952,7 @@ pub(crate) fn define(
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2885,7 +2969,7 @@ pub(crate) fn define(
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2902,7 +2986,7 @@ pub(crate) fn define(
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2947,7 +3031,7 @@ pub(crate) fn define(
r#"
Rotate left by immediate.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -2959,7 +3043,7 @@ pub(crate) fn define(
r#"
Rotate right by immediate.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -3034,7 +3118,7 @@ pub(crate) fn define(
The shift amount is masked to the size of ``x``.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -3048,7 +3132,7 @@ pub(crate) fn define(
The shift amount is masked to the size of the register.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),
@ -3062,7 +3146,7 @@ pub(crate) fn define(
The shift amount is masked to the size of the register.
"#,
&formats.binary_imm,
&formats.binary_imm64,
)
.operands_in(vec![x, Y])
.operands_out(vec![a]),


@ -61,6 +61,7 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro
let cls = insts.by_name("cls");
let clz = insts.by_name("clz");
let ctz = insts.by_name("ctz");
let copy = insts.by_name("copy");
let fabs = insts.by_name("fabs");
let f32const = insts.by_name("f32const");
let f64const = insts.by_name("f64const");
@ -198,8 +199,6 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro
let ah = var("ah");
let cc = var("cc");
let block = var("block");
let block1 = var("block1");
let block2 = var("block2");
let ptr = var("ptr");
let flags = var("flags");
let offset = var("off");
@ -212,8 +211,8 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro
// embedded as part of arguments), so use a custom legalization for now.
narrow.custom_legalize(iconst, "narrow_iconst");
{
let inst = uextend.bind(I128).bind(I64);
for &(ty, ty_half) in &[(I128, I64), (I64, I32)] {
let inst = uextend.bind(ty).bind(ty_half);
narrow.legalize(
def!(a = inst(x)),
vec![
@ -223,12 +222,12 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro
);
}
{
let inst = sextend.bind(I128).bind(I64);
for &(ty, ty_half, shift) in &[(I128, I64, 63), (I64, I32, 31)] {
let inst = sextend.bind(ty).bind(ty_half);
narrow.legalize(
def!(a = inst(x)),
vec![
def!(ah = sshr_imm(x, Literal::constant(&imm.imm64, 63))), // splat sign bit to whole number
def!(ah = sshr_imm(x, Literal::constant(&imm.imm64, shift))), // splat sign bit to whole number
def!(a = iconcat(x, ah)),
],
);
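// Illustrative sketch, not part of this diff: the scalar arithmetic behind the
// `sextend` narrowing above. The upper half of a sign-extended value is just the
// sign bit replicated across the half-width, i.e. an arithmetic shift right by
// (half_width - 1): 63 for i128 from i64, 31 for i64 from i32.
fn sextend_i32_to_i64_halves(x: i32) -> (u32, u32) {
    let lo = x as u32; // low half keeps the original bits
    let hi = (x >> 31) as u32; // all ones if negative, all zeros otherwise
    (lo, hi)
}
// e.g. sextend_i32_to_i64_halves(-2) == (0xFFFF_FFFE, 0xFFFF_FFFF)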
@ -268,39 +267,45 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro
],
);
narrow.legalize(
def!(brz.I128(x, block, vararg)),
vec![
def!((xl, xh) = isplit(x)),
def!(
a = icmp_imm(
Literal::enumerator_for(&imm.intcc, "eq"),
xl,
Literal::constant(&imm.imm64, 0)
)
),
def!(
b = icmp_imm(
Literal::enumerator_for(&imm.intcc, "eq"),
xh,
Literal::constant(&imm.imm64, 0)
)
),
def!(c = band(a, b)),
def!(brnz(c, block, vararg)),
],
);
for &ty in &[I128, I64] {
let block = var("block");
let block1 = var("block1");
let block2 = var("block2");
narrow.legalize(
def!(brnz.I128(x, block1, vararg)),
vec![
def!((xl, xh) = isplit(x)),
def!(brnz(xl, block1, vararg)),
def!(jump(block2, Literal::empty_vararg())),
block!(block2),
def!(brnz(xh, block1, vararg)),
],
);
narrow.legalize(
def!(brz.ty(x, block, vararg)),
vec![
def!((xl, xh) = isplit(x)),
def!(
a = icmp_imm(
Literal::enumerator_for(&imm.intcc, "eq"),
xl,
Literal::constant(&imm.imm64, 0)
)
),
def!(
b = icmp_imm(
Literal::enumerator_for(&imm.intcc, "eq"),
xh,
Literal::constant(&imm.imm64, 0)
)
),
def!(c = band(a, b)),
def!(brnz(c, block, vararg)),
],
);
narrow.legalize(
def!(brnz.ty(x, block1, vararg)),
vec![
def!((xl, xh) = isplit(x)),
def!(brnz(xl, block1, vararg)),
def!(jump(block2, Literal::empty_vararg())),
block!(block2),
def!(brnz(xh, block1, vararg)),
],
);
}
narrow.legalize(
def!(a = popcnt.I128(x)),
@ -629,6 +634,14 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro
);
}
for &(ty_half, ty) in &[(I64, I128), (I32, I64)] {
let inst = ireduce.bind(ty_half).bind(ty);
expand.legalize(
def!(a = inst(x)),
vec![def!((b, c) = isplit(x)), def!(a = copy(b))],
);
}
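// Illustrative sketch, not part of this diff: `ireduce` to the half-width type
// simply keeps the low half, which is what the `isplit` + `copy` expansion above
// produces (the high half bound to `c` is discarded).
fn ireduce_i64_to_i32(x: u64) -> u32 {
    (x & 0xFFFF_FFFF) as u32
}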
// Expand integer operations with carry for RISC architectures that don't have
// the flags.
let intcc_ult = Literal::enumerator_for(&imm.intcc, "ult");


@ -25,11 +25,14 @@ pub(crate) fn define() -> SettingGroup {
- `experimental_linear_scan` is an experimental linear scan allocator. It may take less
time to allocate registers, but generated code's quality may be inferior. As of
2020-04-17, it is still experimental and it should not be used in production settings.
- `experimental_linear_scan_checked` is the linear scan allocator with additional self
checks that may take some time to run, and thus these checks are disabled by default.
"#,
vec![
"backtracking",
"backtracking_checked",
"experimental_linear_scan",
"experimental_linear_scan_checked",
],
);
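// Illustrative sketch, not part of this diff: selecting one of the register
// allocator algorithms listed above through the generated settings API, assuming
// the enum above is registered under the name "regalloc" in the shared settings
// group and the usual cranelift_codegen::settings interface.
use cranelift_codegen::settings::{self, Configurable};

fn flags_with_checked_linear_scan() -> settings::Flags {
    let mut builder = settings::builder();
    // Accepted values: "backtracking", "backtracking_checked",
    // "experimental_linear_scan", "experimental_linear_scan_checked".
    builder
        .set("regalloc", "experimental_linear_scan_checked")
        .expect("unknown setting or value");
    settings::Flags::new(builder)
}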


@ -1 +1 @@
{"files":{"Cargo.toml":"702a281a26cf7099e1b3ca5e8bea145c113f52242be4f1e7e5b06bf129092599","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"a410bc2f5dcbde499c0cd299c2620bc8111e3c5b3fccdd9e2d85caf3c24fdab3","src/condcodes.rs":"b8d433b2217b86e172d25b6c65a3ce0cc8ca221062cad1b28b0c78d2159fbda9","src/constant_hash.rs":"ffc619f45aad62c6fdcb83553a05879691a72e9a0103375b2d6cc12d52cf72d0","src/constants.rs":"fed03a10a6316e06aa174091db6e7d1fbb5f73c82c31193012ec5ab52f1c603a","src/isa/mod.rs":"428a950eca14acbe783899ccb1aecf15027f8cbe205578308ebde203d10535f3","src/isa/x86/encoding_bits.rs":"7e013fb804b13f9f83a0d517c6f5105856938d08ad378cc44a6fe6a59adef270","src/isa/x86/mod.rs":"01ef4e4d7437f938badbe2137892183c1ac684da0f68a5bec7e06aad34f43b9b","src/lib.rs":"91f26f998f11fb9cb74d2ec171424e29badd417beef023674850ace57149c656"},"package":null}
{"files":{"Cargo.toml":"d3026bf5426d767b0b23f0a4f6272aaeb68f598a92f6c788c1f6948153fa63c3","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"a410bc2f5dcbde499c0cd299c2620bc8111e3c5b3fccdd9e2d85caf3c24fdab3","src/condcodes.rs":"b8d433b2217b86e172d25b6c65a3ce0cc8ca221062cad1b28b0c78d2159fbda9","src/constant_hash.rs":"ffc619f45aad62c6fdcb83553a05879691a72e9a0103375b2d6cc12d52cf72d0","src/constants.rs":"fed03a10a6316e06aa174091db6e7d1fbb5f73c82c31193012ec5ab52f1c603a","src/isa/mod.rs":"428a950eca14acbe783899ccb1aecf15027f8cbe205578308ebde203d10535f3","src/isa/x86/encoding_bits.rs":"7e013fb804b13f9f83a0d517c6f5105856938d08ad378cc44a6fe6a59adef270","src/isa/x86/mod.rs":"01ef4e4d7437f938badbe2137892183c1ac684da0f68a5bec7e06aad34f43b9b","src/lib.rs":"91f26f998f11fb9cb74d2ec171424e29badd417beef023674850ace57149c656"},"package":null}


@ -1,7 +1,7 @@
[package]
authors = ["The Cranelift Project Developers"]
name = "cranelift-codegen-shared"
version = "0.63.0"
version = "0.64.0"
description = "For code shared between cranelift-codegen-meta and cranelift-codegen"
license = "Apache-2.0 WITH LLVM-exception"
repository = "https://github.com/bytecodealliance/wasmtime"

File diff hidden because one or more lines are too long.

third_party/rust/cranelift-codegen/Cargo.toml (vendored)

@ -1,7 +1,7 @@
[package]
authors = ["The Cranelift Project Developers"]
name = "cranelift-codegen"
version = "0.63.0"
version = "0.64.0"
description = "Low-level code generator library"
license = "Apache-2.0 WITH LLVM-exception"
documentation = "https://docs.rs/cranelift-codegen"
@ -13,25 +13,27 @@ build = "build.rs"
edition = "2018"
[dependencies]
cranelift-codegen-shared = { path = "./shared", version = "0.63.0" }
cranelift-entity = { path = "../entity", version = "0.63.0" }
cranelift-bforest = { path = "../bforest", version = "0.63.0" }
cranelift-codegen-shared = { path = "./shared", version = "0.64.0" }
cranelift-entity = { path = "../entity", version = "0.64.0" }
cranelift-bforest = { path = "../bforest", version = "0.64.0" }
hashbrown = { version = "0.7", optional = true }
target-lexicon = "0.10"
log = { version = "0.4.6", default-features = false }
serde = { version = "1.0.94", features = ["derive"], optional = true }
gimli = { version = "0.20.0", default-features = false, features = ["write"], optional = true }
gimli = { version = "0.21.0", default-features = false, features = ["write"], optional = true }
smallvec = { version = "1.0.0" }
thiserror = "1.0.4"
byteorder = { version = "1.3.2", default-features = false }
regalloc = "0.0.21"
peepmatic-runtime = { path = "../peepmatic/crates/runtime", optional = true, version = "0.1.0" }
regalloc = "0.0.25"
# It is a goal of the cranelift-codegen crate to have minimal external dependencies.
# Please don't add any unless they are essential to the task of creating binary
# machine code. Integration tests that need external dependencies can be
# accommodated in `tests`.
[build-dependencies]
cranelift-codegen-meta = { path = "meta", version = "0.63.0" }
cranelift-codegen-meta = { path = "meta", version = "0.64.0" }
peepmatic = { path = "../peepmatic", optional = true, version = "0.64.0" }
[features]
default = ["std", "unwind"]
@ -58,10 +60,12 @@ x86 = []
arm32 = []
arm64 = []
riscv = []
x64 = [] # New work-in-progress codegen backend for x86_64 based on the new isel.
# Option to enable all architectures.
all-arch = [
"x86",
"x64",
"arm32",
"arm64",
"riscv"
@ -70,5 +74,12 @@ all-arch = [
# For dependent crates that want to serialize some parts of cranelift
enable-serde = ["serde"]
# Recompile our optimizations that are written in the `peepmatic` DSL into a
# compact finite-state transducer automaton.
rebuild-peephole-optimizers = ["peepmatic"]
# Enable the use of `peepmatic`-generated peephole optimizers.
enable-peepmatic = ["peepmatic-runtime"]
[badges]
maintenance = { status = "experimental" }

third_party/rust/cranelift-codegen/build.rs (vendored)

@ -71,4 +71,22 @@ fn main() {
);
println!("cargo:warning=Generated files are in {}", out_dir);
}
#[cfg(feature = "rebuild-peephole-optimizers")]
rebuild_peephole_optimizers();
}
#[cfg(feature = "rebuild-peephole-optimizers")]
fn rebuild_peephole_optimizers() {
use std::path::Path;
let source_path = Path::new("src").join("preopt.peepmatic");
println!("cargo:rerun-if-changed={}", source_path.display());
let preopt =
peepmatic::compile_file(&source_path).expect("failed to compile `src/preopt.peepmatic`");
preopt
.serialize_to_file(&Path::new("src").join("preopt.serialized"))
.expect("failed to serialize peephole optimizer to `src/preopt.serialized`");
}

third_party/rust/cranelift-codegen/src/abi.rs (vendored)

@ -54,6 +54,9 @@ pub enum ValueConversion {
/// Unsigned zero-extend value to the required type.
Uext(Type),
/// Pass value by pointer of given integer type.
Pointer(Type),
}
impl ValueConversion {
@ -63,7 +66,7 @@ impl ValueConversion {
Self::IntSplit => ty.half_width().expect("Integer type too small to split"),
Self::VectorSplit => ty.half_vector().expect("Not a vector"),
Self::IntBits => Type::int(ty.bits()).expect("Bad integer size"),
Self::Sext(nty) | Self::Uext(nty) => nty,
Self::Sext(nty) | Self::Uext(nty) | Self::Pointer(nty) => nty,
}
}
@ -74,6 +77,11 @@ impl ValueConversion {
_ => false,
}
}
/// Is this a conversion to pointer?
pub fn is_pointer(self) -> bool {
matches!(self, Self::Pointer(_))
}
}
/// Common trait for assigning arguments to registers or stack locations.
@ -110,10 +118,16 @@ pub fn legalize_args<AA: ArgAssigner>(args: &[AbiParam], aa: &mut AA) -> Option<
}
// Split this argument into two smaller ones. Then revisit both.
ArgAction::Convert(conv) => {
debug_assert!(
!arg.legalized_to_pointer,
"No more conversions allowed after conversion to pointer"
);
let value_type = conv.apply(arg.value_type);
let new_arg = AbiParam { value_type, ..arg };
args.to_mut()[argno].value_type = value_type;
if conv.is_split() {
if conv.is_pointer() {
args.to_mut()[argno].legalized_to_pointer = true;
} else if conv.is_split() {
let new_arg = AbiParam { value_type, ..arg };
args.to_mut().insert(argno + 1, new_arg);
}
}
@ -152,6 +166,10 @@ pub fn legalize_abi_value(have: Type, arg: &AbiParam) -> ValueConversion {
let have_bits = have.bits();
let arg_bits = arg.value_type.bits();
if arg.legalized_to_pointer {
return ValueConversion::Pointer(arg.value_type);
}
match have_bits.cmp(&arg_bits) {
// We have fewer bits than the ABI argument.
Ordering::Less => {
@ -226,5 +244,12 @@ mod tests {
legalize_abi_value(types::F64, &arg),
ValueConversion::IntBits
);
// Value is passed by reference
arg.legalized_to_pointer = true;
assert_eq!(
legalize_abi_value(types::F64, &arg),
ValueConversion::Pointer(types::I32)
);
}
}


@ -15,7 +15,8 @@ const NUM_BITS: usize = core::mem::size_of::<Num>() * 8;
/// The first value in the bitmap is of the lowest addressed slot on the stack.
/// As all stacks in Isa's supported by Cranelift grow down, this means that
/// first value is of the top of the stack and values proceed down the stack.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "enable-serde", derive(serde::Deserialize, serde::Serialize))]
pub struct Stackmap {
bitmap: Vec<BitSet<Num>>,
mapped_words: u32,


@ -5,12 +5,14 @@
//!
//! If you would like to add support for larger bitsets in the future, you need to change the trait
//! bound Into<u32> and the u32 in the implementation of `max_bits()`.
use core::convert::{From, Into};
use core::mem::size_of;
use core::ops::{Add, BitOr, Shl, Sub};
/// A small bitset built on a single primitive integer type
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "enable-serde", derive(serde::Serialize, serde::Deserialize))]
pub struct BitSet<T>(pub T);
impl<T> BitSet<T>


@ -27,6 +27,7 @@ use crate::nan_canonicalization::do_nan_canonicalization;
use crate::postopt::do_postopt;
use crate::redundant_reload_remover::RedundantReloadRemover;
use crate::regalloc;
use crate::remove_constant_phis::do_remove_constant_phis;
use crate::result::CodegenResult;
use crate::settings::{FlagsOrIsa, OptLevel};
use crate::simple_gvn::do_simple_gvn;
@ -179,6 +180,8 @@ impl Context {
self.dce(isa)?;
}
self.remove_constant_phis(isa)?;
if let Some(backend) = isa.get_mach_backend() {
let result = backend.compile_function(&self.func, self.want_disasm)?;
let info = result.code_info();
@ -224,7 +227,7 @@ impl Context {
let _tt = timing::binemit();
let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps);
if let Some(ref result) = &self.mach_compile_result {
result.sections.emit(&mut sink);
result.buffer.emit(&mut sink);
} else {
isa.emit_function_to_memory(&self.func, &mut sink);
}
@ -292,6 +295,16 @@ impl Context {
Ok(())
}
/// Perform constant-phi removal on the function.
pub fn remove_constant_phis<'a, FOI: Into<FlagsOrIsa<'a>>>(
&mut self,
fisa: FOI,
) -> CodegenResult<()> {
do_remove_constant_phis(&mut self.func, &mut self.domtree);
self.verify_if(fisa)?;
Ok(())
}
/// Perform pre-legalization rewrites on the function.
pub fn preopt(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
do_preopt(&mut self.func, &mut self.cfg, isa);


@ -794,15 +794,20 @@ impl<'c, 'f> ir::InstInserterBase<'c> for &'c mut EncCursor<'f> {
if !self.srcloc.is_default() {
self.func.srclocs[inst] = self.srcloc;
}
// Assign an encoding.
// XXX Is there a way to describe this error to the user?
#[cfg_attr(feature = "cargo-clippy", allow(clippy::match_wild_err_arm))]
match self
.isa
.encode(&self.func, &self.func.dfg[inst], ctrl_typevar)
{
Ok(e) => self.func.encodings[inst] = e,
Err(_) => panic!("can't encode {}", self.display_inst(inst)),
// Skip the encoding update if we're using a new (MachInst) backend; encodings come later,
// during lowering.
if self.isa.get_mach_backend().is_none() {
// Assign an encoding.
// XXX Is there a way to describe this error to the user?
#[cfg_attr(feature = "cargo-clippy", allow(clippy::match_wild_err_arm))]
match self
.isa
.encode(&self.func, &self.func.dfg[inst], ctrl_typevar)
{
Ok(e) => self.func.encodings[inst] = e,
Err(_) => panic!("can't encode {}", self.display_inst(inst)),
}
}
&mut self.func.dfg


@ -40,3 +40,24 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
let opcode = data.opcode();
trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data)
}
/// Does the given instruction have any side-effect as per [has_side_effect], or else is a load?
pub fn has_side_effect_or_load(func: &Function, inst: Inst) -> bool {
has_side_effect(func, inst) || func.dfg[inst].opcode().can_load()
}
/// Is the given instruction a constant value (`iconst`, `fconst`, `bconst`) that can be
/// represented in 64 bits?
pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option<u64> {
let data = &func.dfg[inst];
if data.opcode() == Opcode::Null {
return Some(0);
}
match data {
&InstructionData::UnaryImm { imm, .. } => Some(imm.bits() as u64),
&InstructionData::UnaryIeee32 { imm, .. } => Some(imm.bits() as u64),
&InstructionData::UnaryIeee64 { imm, .. } => Some(imm.bits()),
&InstructionData::UnaryBool { imm, .. } => Some(if imm { 1 } else { 0 }),
_ => None,
}
}


@ -234,11 +234,7 @@ impl DataFlowGraph {
/// Get the type of a value.
pub fn value_type(&self, v: Value) -> Type {
match self.values[v] {
ValueData::Inst { ty, .. }
| ValueData::Param { ty, .. }
| ValueData::Alias { ty, .. } => ty,
}
self.values[v].ty()
}
/// Get the definition of a value.
@ -383,9 +379,14 @@ pub enum ValueDef {
impl ValueDef {
/// Unwrap the instruction where the value was defined, or panic.
pub fn unwrap_inst(&self) -> Inst {
self.inst().expect("Value is not an instruction result")
}
/// Get the instruction where the value was defined, if any.
pub fn inst(&self) -> Option<Inst> {
match *self {
Self::Result(inst, _) => inst,
_ => panic!("Value is not an instruction result"),
Self::Result(inst, _) => Some(inst),
_ => None,
}
}
@ -428,6 +429,16 @@ enum ValueData {
Alias { ty: Type, original: Value },
}
impl ValueData {
fn ty(&self) -> Type {
match *self {
ValueData::Inst { ty, .. }
| ValueData::Param { ty, .. }
| ValueData::Alias { ty, .. } => ty,
}
}
}
/// Instructions.
///
impl DataFlowGraph {


@ -7,6 +7,7 @@
use crate::ir::{ArgumentLoc, ExternalName, SigRef, Type};
use crate::isa::{CallConv, RegInfo, RegUnit};
use crate::machinst::RelocDistance;
use alloc::vec::Vec;
use core::fmt;
use core::str::FromStr;
@ -155,6 +156,8 @@ pub struct AbiParam {
/// ABI-specific location of this argument, or `Unassigned` for arguments that have not yet
/// been legalized.
pub location: ArgumentLoc,
/// Was the argument converted to pointer during legalization?
pub legalized_to_pointer: bool,
}
impl AbiParam {
@ -165,6 +168,7 @@ impl AbiParam {
extension: ArgumentExtension::None,
purpose: ArgumentPurpose::Normal,
location: Default::default(),
legalized_to_pointer: false,
}
}
@ -175,6 +179,7 @@ impl AbiParam {
extension: ArgumentExtension::None,
purpose,
location: Default::default(),
legalized_to_pointer: false,
}
}
@ -185,6 +190,7 @@ impl AbiParam {
extension: ArgumentExtension::None,
purpose,
location: ArgumentLoc::Reg(regunit),
legalized_to_pointer: false,
}
}
@ -218,6 +224,9 @@ pub struct DisplayAbiParam<'a>(&'a AbiParam, Option<&'a RegInfo>);
impl<'a> fmt::Display for DisplayAbiParam<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0.value_type)?;
if self.0.legalized_to_pointer {
write!(f, " ptr")?;
}
match self.0.extension {
ArgumentExtension::None => {}
ArgumentExtension::Uext => write!(f, " uext")?,
@ -366,6 +375,16 @@ pub struct ExtFuncData {
/// Will this function be defined nearby, such that it will always be a certain distance away,
/// after linking? If so, references to it can avoid going through a GOT or PLT. Note that
/// symbols meant to be preemptible cannot be considered colocated.
///
/// If `true`, some backends may use relocation forms that have limited range. The exact
/// distance depends on the code model in use. Currently on AArch64, for example, Cranelift
/// uses a custom code model supporting up to +/- 128MB displacements. If it is unknown how
/// far away the target will be, it is best not to set the `colocated` flag; in general, this
/// flag is best used when the target is known to be in the same unit of code generation, such
/// as a Wasm module.
///
/// See the documentation for [`RelocDistance`](machinst::RelocDistance) for more details. A
/// `colocated` flag value of `true` implies `RelocDistance::Near`.
pub colocated: bool,
}
@ -378,6 +397,17 @@ impl fmt::Display for ExtFuncData {
}
}
impl ExtFuncData {
/// Return an estimate of the distance to the referred-to function symbol.
pub fn reloc_distance(&self) -> RelocDistance {
if self.colocated {
RelocDistance::Near
} else {
RelocDistance::Far
}
}
}
#[cfg(test)]
mod tests {
use super::*;
@ -393,6 +423,8 @@ mod tests {
assert_eq!(t.sext().to_string(), "i32 sext");
t.purpose = ArgumentPurpose::StructReturn;
assert_eq!(t.to_string(), "i32 uext sret");
t.legalized_to_pointer = true;
assert_eq!(t.to_string(), "i32 ptr uext sret");
}
#[test]


@ -308,6 +308,30 @@ impl Function {
// function, assume it is not a leaf.
self.dfg.signatures.is_empty()
}
/// Replace the `dst` instruction's data with the `src` instruction's data
/// and then remove `src`.
///
/// `src` and its result values should not be used at all, as any uses would
/// be left dangling after calling this method.
///
/// `src` and `dst` must have the same number of resulting values, and
/// `src`'s i^th value must have the same type as `dst`'s i^th value.
pub fn transplant_inst(&mut self, dst: Inst, src: Inst) {
debug_assert_eq!(
self.dfg.inst_results(dst).len(),
self.dfg.inst_results(src).len()
);
debug_assert!(self
.dfg
.inst_results(dst)
.iter()
.zip(self.dfg.inst_results(src))
.all(|(a, b)| self.dfg.value_type(*a) == self.dfg.value_type(*b)));
self.dfg[dst] = self.dfg[src].clone();
self.layout.remove_inst(src);
}
}
/// Additional annotations for function display.


@ -3,6 +3,7 @@
use crate::ir::immediates::{Imm64, Offset32};
use crate::ir::{ExternalName, GlobalValue, Type};
use crate::isa::TargetIsa;
use crate::machinst::RelocDistance;
use core::fmt;
/// Information about a global value declaration.
@ -62,6 +63,10 @@ pub enum GlobalValueData {
/// Will this symbol be defined nearby, such that it will always be a certain distance
/// away, after linking? If so, references to it can avoid going through a GOT. Note that
/// symbols meant to be preemptible cannot be colocated.
///
/// If `true`, some backends may use relocation forms that have limited range: for example,
/// a +/- 2^27-byte range on AArch64. See the documentation for
/// [`RelocDistance`](machinst::RelocDistance) for more details.
colocated: bool,
/// Does this symbol refer to a thread local storage value?
@ -85,6 +90,20 @@ impl GlobalValueData {
Self::IAddImm { global_type, .. } | Self::Load { global_type, .. } => global_type,
}
}
/// If this global references a symbol, return an estimate of the relocation distance,
/// based on the `colocated` flag.
pub fn maybe_reloc_distance(&self) -> Option<RelocDistance> {
match self {
&GlobalValueData::Symbol {
colocated: true, ..
} => Some(RelocDistance::Near),
&GlobalValueData::Symbol {
colocated: false, ..
} => Some(RelocDistance::Far),
_ => None,
}
}
}
impl fmt::Display for GlobalValueData {


@ -62,6 +62,21 @@ impl Imm64 {
pub fn bits(&self) -> i64 {
self.0
}
/// Sign extend this immediate as if it were a signed integer of the given
/// power-of-two width.
pub fn sign_extend_from_width(&mut self, bit_width: u16) {
debug_assert!(bit_width.is_power_of_two());
if bit_width >= 64 {
return;
}
let bit_width = bit_width as i64;
let delta = 64 - bit_width;
let sign_extended = (self.0 << delta) >> delta;
*self = Imm64(sign_extended);
}
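// Illustrative sketch, not part of this diff: the shift trick used by
// `sign_extend_from_width` above. Shifting left and then arithmetically right by
// (64 - bit_width) replicates the sign bit of the narrow value through the upper
// bits of the i64.
fn sign_extend(bits: i64, bit_width: u16) -> i64 {
    let delta = 64 - i64::from(bit_width);
    (bits << delta) >> delta
}
// e.g. sign_extend(0xFF, 8) == -1, while sign_extend(0x7F, 8) == 127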
}
impl Into<i64> for Imm64 {


@ -11,9 +11,7 @@ use core::fmt::{self, Display, Formatter};
use core::ops::{Deref, DerefMut};
use core::str::FromStr;
use crate::ir;
use crate::ir::types;
use crate::ir::{Block, FuncRef, JumpTable, SigRef, Type, Value};
use crate::ir::{self, trapcode::TrapCode, types, Block, FuncRef, JumpTable, SigRef, Type, Value};
use crate::isa;
use crate::bitset::BitSet;
@ -257,6 +255,30 @@ impl InstructionData {
}
}
/// If this is a trapping instruction, get its trap code. Otherwise, return
/// `None`.
pub fn trap_code(&self) -> Option<TrapCode> {
match *self {
Self::CondTrap { code, .. }
| Self::FloatCondTrap { code, .. }
| Self::IntCondTrap { code, .. }
| Self::Trap { code, .. } => Some(code),
_ => None,
}
}
/// If this is a trapping instruction, get an exclusive reference to its
/// trap code. Otherwise, return `None`.
pub fn trap_code_mut(&mut self) -> Option<&mut TrapCode> {
match self {
Self::CondTrap { code, .. }
| Self::FloatCondTrap { code, .. }
| Self::IntCondTrap { code, .. }
| Self::Trap { code, .. } => Some(code),
_ => None,
}
}
/// Return information about a call instruction.
///
/// Any instruction that can call another function reveals its call signature here.
@ -274,6 +296,39 @@ impl InstructionData {
}
}
}
#[inline]
pub(crate) fn sign_extend_immediates(&mut self, ctrl_typevar: Type) {
if ctrl_typevar.is_invalid() {
return;
}
let bit_width = ctrl_typevar.bits();
match self {
Self::BinaryImm64 {
opcode,
arg: _,
imm,
} => {
if matches!(opcode, Opcode::SdivImm | Opcode::SremImm) {
imm.sign_extend_from_width(bit_width);
}
}
Self::IntCompareImm {
opcode,
arg: _,
cond,
imm,
} => {
debug_assert_eq!(*opcode, Opcode::IcmpImm);
if cond.unsigned() != *cond {
imm.sign_extend_from_width(bit_width);
}
}
_ => {}
}
}
}
/// Information about branch and jump instructions.


@ -24,6 +24,20 @@ pub enum LibCall {
/// probe for stack overflow. These are emitted for functions which need it
/// when the `enable_probestack` setting is true.
Probestack,
/// udiv.i64
UdivI64,
/// sdiv.i64
SdivI64,
/// urem.i64
UremI64,
/// srem.i64
SremI64,
/// ishl.i64
IshlI64,
/// ushr.i64
UshrI64,
/// sshr.i64
SshrI64,
/// ceil.f32
CeilF32,
/// ceil.f64
@ -63,6 +77,13 @@ impl FromStr for LibCall {
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"Probestack" => Ok(Self::Probestack),
"UdivI64" => Ok(Self::UdivI64),
"SdivI64" => Ok(Self::SdivI64),
"UremI64" => Ok(Self::UremI64),
"SremI64" => Ok(Self::SremI64),
"IshlI64" => Ok(Self::IshlI64),
"UshrI64" => Ok(Self::UshrI64),
"SshrI64" => Ok(Self::SshrI64),
"CeilF32" => Ok(Self::CeilF32),
"CeilF64" => Ok(Self::CeilF64),
"FloorF32" => Ok(Self::FloorF32),
@ -88,6 +109,16 @@ impl LibCall {
/// Returns `None` if no well-known library routine name exists for that instruction.
pub fn for_inst(opcode: Opcode, ctrl_type: Type) -> Option<Self> {
Some(match ctrl_type {
types::I64 => match opcode {
Opcode::Udiv => Self::UdivI64,
Opcode::Sdiv => Self::SdivI64,
Opcode::Urem => Self::UremI64,
Opcode::Srem => Self::SremI64,
Opcode::Ishl => Self::IshlI64,
Opcode::Ushr => Self::UshrI64,
Opcode::Sshr => Self::SshrI64,
_ => return None,
},
types::F32 => match opcode {
Opcode::Ceil => Self::CeilF32,
Opcode::Floor => Self::FloorF32,
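// Illustrative sketch, not part of this diff: querying the new 64-bit integer
// libcall mappings added above, e.g. from a legalizer deciding whether a scalar
// 64-bit division must be lowered to a runtime call on a 32-bit target.
use cranelift_codegen::ir::{types, LibCall, Opcode};

fn udiv_i64_needs_libcall() -> bool {
    matches!(
        LibCall::for_inst(Opcode::Udiv, types::I64),
        Some(LibCall::UdivI64)
    )
}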


@ -27,9 +27,6 @@ pub enum TrapCode {
/// A `table_addr` instruction detected an out-of-bounds error.
TableOutOfBounds,
/// Other bounds checking error.
OutOfBounds,
/// Indirect call to a null table entry.
IndirectCallToNull,
@ -63,7 +60,6 @@ impl Display for TrapCode {
StackOverflow => "stk_ovf",
HeapOutOfBounds => "heap_oob",
TableOutOfBounds => "table_oob",
OutOfBounds => "oob",
IndirectCallToNull => "icall_null",
BadSignature => "bad_sig",
IntegerOverflow => "int_ovf",
@ -86,7 +82,6 @@ impl FromStr for TrapCode {
"stk_ovf" => Ok(StackOverflow),
"heap_oob" => Ok(HeapOutOfBounds),
"table_oob" => Ok(TableOutOfBounds),
"oob" => Ok(OutOfBounds),
"icall_null" => Ok(IndirectCallToNull),
"bad_sig" => Ok(BadSignature),
"int_ovf" => Ok(IntegerOverflow),
@ -106,11 +101,10 @@ mod tests {
use alloc::string::ToString;
// Everything but user-defined codes.
const CODES: [TrapCode; 11] = [
const CODES: [TrapCode; 10] = [
TrapCode::StackOverflow,
TrapCode::HeapOutOfBounds,
TrapCode::TableOutOfBounds,
TrapCode::OutOfBounds,
TrapCode::IndirectCallToNull,
TrapCode::BadSignature,
TrapCode::IntegerOverflow,

File diff not shown because of its large size.


@ -3,14 +3,14 @@
// Some variants are never constructed, but we still want them as options in the future.
#![allow(dead_code)]
use crate::binemit::CodeOffset;
use crate::ir::Type;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::lower::ty_bits;
use crate::machinst::MachLabel;
use regalloc::{RealRegUniverse, Reg, Writable};
use core::convert::{Into, TryFrom};
use core::convert::Into;
use std::string::String;
/// A shift operator for a register or immediate.
@ -112,7 +112,9 @@ pub enum MemLabel {
/// A memory argument to load/store, encapsulating the possible addressing modes.
#[derive(Clone, Debug)]
pub enum MemArg {
Label(MemLabel),
//
// Real ARM64 addressing modes:
//
/// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation.
PostIndexed(Writable<Reg>, SImm9),
/// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation.
@ -137,11 +139,35 @@ pub enum MemArg {
/// Scaled (by size of a type) unsigned 12-bit immediate offset from reg.
UnsignedOffset(Reg, UImm12Scaled),
/// Offset from the stack pointer. Lowered into a real amode at emission.
SPOffset(i64),
//
// virtual addressing modes that are lowered at emission time:
//
/// Reference to a "label": e.g., a symbol.
Label(MemLabel),
/// Offset from the frame pointer. Lowered into a real amode at emission.
FPOffset(i64),
/// Arbitrary offset from a register. Converted to generation of large
/// offsets with multiple instructions as necessary during code emission.
RegOffset(Reg, i64, Type),
/// Offset from the stack pointer.
SPOffset(i64, Type),
/// Offset from the frame pointer.
FPOffset(i64, Type),
/// Offset from the "nominal stack pointer", which is where the real SP is
/// just after stack and spill slots are allocated in the function prologue.
/// At emission time, this is converted to `SPOffset` with a fixup added to
/// the offset constant. The fixup is a running value that is tracked as
/// emission iterates through instructions in linear order, and can be
/// adjusted up and down with [Inst::VirtualSPOffsetAdj].
///
/// The standard ABI is in charge of handling this (by emitting the
/// adjustment meta-instructions). It maintains the invariant that "nominal
/// SP" is where the actual SP is after the function prologue and before
/// clobber pushes. See the diagram in the documentation for
/// [crate::isa::aarch64::abi](the ABI module) for more details.
NominalSPOffset(i64, Type),
}
impl MemArg {
@ -152,17 +178,6 @@ impl MemArg {
MemArg::UnsignedOffset(reg, UImm12Scaled::zero(I64))
}
/// Memory reference using an address in a register and an offset, if possible.
pub fn reg_maybe_offset(reg: Reg, offset: i64, value_type: Type) -> Option<MemArg> {
if let Some(simm9) = SImm9::maybe_from_i64(offset) {
Some(MemArg::Unscaled(reg, simm9))
} else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(offset, value_type) {
Some(MemArg::UnsignedOffset(reg, uimm12s))
} else {
None
}
}
/// Memory reference using the sum of two registers as an address.
pub fn reg_plus_reg(reg1: Reg, reg2: Reg) -> MemArg {
MemArg::RegReg(reg1, reg2)
@ -281,78 +296,44 @@ impl CondBrKind {
/// A branch target. Either unresolved (basic-block index) or resolved (offset
/// from end of current instruction).
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BranchTarget {
/// An unresolved reference to a BlockIndex, as passed into
/// An unresolved reference to a Label, as passed into
/// `lower_branch_group()`.
Block(BlockIndex),
/// A resolved reference to another instruction, after
/// `Inst::with_block_offsets()`.
ResolvedOffset(isize),
Label(MachLabel),
/// A fixed PC offset.
ResolvedOffset(i32),
}
impl BranchTarget {
/// Lower the branch target given offsets of each block.
pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) {
/// Return the target's label, if it is a label-based target.
pub fn as_label(self) -> Option<MachLabel> {
match self {
&mut BranchTarget::Block(bix) => {
let bix = usize::try_from(bix).unwrap();
assert!(bix < targets.len());
let block_offset_in_func = targets[bix];
let branch_offset = (block_offset_in_func as isize) - (my_offset as isize);
*self = BranchTarget::ResolvedOffset(branch_offset);
}
&mut BranchTarget::ResolvedOffset(..) => {}
}
}
/// Get the block index.
pub fn as_block_index(&self) -> Option<BlockIndex> {
match self {
&BranchTarget::Block(bix) => Some(bix),
BranchTarget::Label(l) => Some(l),
_ => None,
}
}
/// Get the offset as 4-byte words. Returns `0` if not
/// yet resolved (in that case, we're only computing
/// size and the offset doesn't matter).
pub fn as_offset_words(&self) -> isize {
match self {
&BranchTarget::ResolvedOffset(off) => off >> 2,
/// Return the target's offset, if specified, or zero if label-based.
pub fn as_offset19_or_zero(self) -> u32 {
let off = match self {
BranchTarget::ResolvedOffset(off) => off >> 2,
_ => 0,
}
};
assert!(off <= 0x3ffff);
assert!(off >= -0x40000);
(off as u32) & 0x7ffff
}
/// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow.
pub fn as_off26(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 25)) && (off >= -(1 << 25)) {
Some((off as u32) & ((1 << 26) - 1))
} else {
None
}
}
/// Get the offset as a 19-bit offset, or `None` if overflow.
pub fn as_off19(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 18)) && (off >= -(1 << 18)) {
Some((off as u32) & ((1 << 19) - 1))
} else {
None
}
}
/// Map the block index given a transform map.
pub fn map(&mut self, block_index_map: &[BlockIndex]) {
match self {
&mut BranchTarget::Block(ref mut bix) => {
let n = block_index_map[usize::try_from(*bix).unwrap()];
*bix = n;
}
&mut BranchTarget::ResolvedOffset(_) => {}
}
/// Return the target's offset, if specified, or zero if label-based.
pub fn as_offset26_or_zero(self) -> u32 {
let off = match self {
BranchTarget::ResolvedOffset(off) => off >> 2,
_ => 0,
};
assert!(off <= 0x1ffffff);
assert!(off >= -0x2000000);
(off as u32) & 0x3ffffff
}
}
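// Illustrative sketch, not part of this diff: the arithmetic behind
// `as_offset19_or_zero` above. Byte offsets are expressed in 4-byte words and
// truncated to a 19-bit two's-complement field for the conditional-branch forms.
fn encode_off19(byte_off: i32) -> u32 {
    let words = byte_off >> 2;
    assert!((-0x40000..=0x3ffff).contains(&words));
    (words as u32) & 0x7ffff
}
// e.g. encode_off19(-8) == 0x7fffe (minus two words in 19-bit two's complement)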
@ -443,8 +424,11 @@ impl ShowWithRRU for MemArg {
simm9.show_rru(mb_rru)
),
// Eliminated by `mem_finalize()`.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => {
panic!("Unexpected stack-offset mem-arg mode!")
&MemArg::SPOffset(..)
| &MemArg::FPOffset(..)
| &MemArg::NominalSPOffset(..)
| &MemArg::RegOffset(..) => {
panic!("Unexpected pseudo mem-arg mode (stack-offset or generic reg-offset)!")
}
}
}
@ -485,18 +469,21 @@ impl ShowWithRRU for Cond {
impl ShowWithRRU for BranchTarget {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
&BranchTarget::Block(block) => format!("block{}", block),
&BranchTarget::Label(label) => format!("label{:?}", label.get()),
&BranchTarget::ResolvedOffset(off) => format!("{}", off),
}
}
}
/// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and
/// 64-bit variants of many instructions (and integer registers).
/// 64-bit variants of many instructions (and integer and floating-point registers) and 128-bit
/// variants of vector instructions.
/// TODO: Create a separate type for SIMD & floating-point operands.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InstSize {
Size32,
Size64,
Size128,
}
impl InstSize {
@ -519,11 +506,13 @@ impl InstSize {
/// Convert from a needed width to the smallest size that fits.
pub fn from_bits<I: Into<usize>>(bits: I) -> InstSize {
let bits: usize = bits.into();
assert!(bits <= 64);
assert!(bits <= 128);
if bits <= 32 {
InstSize::Size32
} else {
} else if bits <= 64 {
InstSize::Size64
} else {
InstSize::Size128
}
}
@ -532,11 +521,12 @@ impl InstSize {
Self::from_bits(ty_bits(ty))
}
/// Convert to I32 or I64.
/// Convert to I32, I64, or I128.
pub fn to_ty(self) -> Type {
match self {
InstSize::Size32 => I32,
InstSize::Size64 => I64,
InstSize::Size128 => I128,
}
}
@ -544,6 +534,9 @@ impl InstSize {
match self {
InstSize::Size32 => 0,
InstSize::Size64 => 1,
_ => {
panic!("Unexpected size");
}
}
}
}


@ -4,12 +4,13 @@ use crate::binemit::{CodeOffset, Reloc};
use crate::ir::constant::ConstantData;
use crate::ir::types::*;
use crate::ir::TrapCode;
use crate::isa::aarch64::{inst::regs::PINNED_REG, inst::*};
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::lower::ty_bits;
use regalloc::{Reg, RegClass, Writable};
use alloc::vec::Vec;
use core::convert::TryFrom;
use log::debug;
/// Memory label/reference finalization: convert a MemLabel to a PC-relative
/// offset, possibly emitting relocation(s) as necessary.
@ -23,43 +24,67 @@ pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 {
/// generic arbitrary stack offset) into real addressing modes, possibly by
/// emitting some helper instructions that come immediately before the use
/// of this amode.
pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec<Inst>, MemArg) {
pub fn mem_finalize(
insn_off: CodeOffset,
mem: &MemArg,
state: &EmitState,
) -> (SmallVec<[Inst; 4]>, MemArg) {
match mem {
&MemArg::SPOffset(off) | &MemArg::FPOffset(off) => {
&MemArg::RegOffset(_, off, ty)
| &MemArg::SPOffset(off, ty)
| &MemArg::FPOffset(off, ty)
| &MemArg::NominalSPOffset(off, ty) => {
let basereg = match mem {
&MemArg::SPOffset(..) => stack_reg(),
&MemArg::RegOffset(reg, _, _) => reg,
&MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => stack_reg(),
&MemArg::FPOffset(..) => fp_reg(),
_ => unreachable!(),
};
let adj = match mem {
&MemArg::NominalSPOffset(..) => {
debug!(
"mem_finalize: nominal SP offset {} + adj {} -> {}",
off,
state.virtual_sp_offset,
off + state.virtual_sp_offset
);
state.virtual_sp_offset
}
_ => 0,
};
let off = off + adj;
if let Some(simm9) = SImm9::maybe_from_i64(off) {
let mem = MemArg::Unscaled(basereg, simm9);
(vec![], mem)
(smallvec![], mem)
} else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(off, ty) {
let mem = MemArg::UnsignedOffset(basereg, uimm12s);
(smallvec![], mem)
} else {
// In an addition, x31 is the zero register, not sp; we have only one temporary
// so we can't do the proper add here.
debug_assert_ne!(
basereg,
stack_reg(),
"should have diverted SP before mem_finalize"
);
let tmp = writable_spilltmp_reg();
let mut const_insts = Inst::load_constant(tmp, off as u64);
let add_inst = Inst::AluRRR {
// N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
// (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
// is a valid base (for SPOffset) which we must handle here.
// Also, SP needs to be the first arg, not second.
let add_inst = Inst::AluRRRExtend {
alu_op: ALUOp::Add64,
rd: tmp,
rn: tmp.to_reg(),
rm: basereg,
rn: basereg,
rm: tmp.to_reg(),
extendop: ExtendOp::UXTX,
};
const_insts.push(add_inst);
(const_insts.to_vec(), MemArg::reg(tmp.to_reg()))
(const_insts, MemArg::reg(tmp.to_reg()))
}
}
&MemArg::Label(ref label) => {
let off = memlabel_finalize(insn_off, label);
(vec![], MemArg::Label(MemLabel::PCRel(off)))
(smallvec![], MemArg::Label(MemLabel::PCRel(off)))
}
_ => (vec![], mem.clone()),
_ => (smallvec![], mem.clone()),
}
}
@ -73,12 +98,12 @@ pub fn u64_constant(bits: u64) -> ConstantData {
// Instructions and subcomponents: emission
fn machreg_to_gpr(m: Reg) -> u32 {
assert!(m.get_class() == RegClass::I64);
assert_eq!(m.get_class(), RegClass::I64);
u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
}
fn machreg_to_vec(m: Reg) -> u32 {
assert!(m.get_class() == RegClass::V128);
assert_eq!(m.get_class(), RegClass::V128);
u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
}
@ -137,6 +162,14 @@ fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 {
(op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond
}
fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 {
match kind {
CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg),
CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg),
CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()),
}
}
const MOVE_WIDE_FIXED: u32 = 0x92800000;
#[repr(u32)]
@ -275,8 +308,8 @@ fn enc_ccmp_imm(size: InstSize, rn: Reg, imm: UImm5, nzcv: NZCV, cond: Cond) ->
}
fn enc_vecmov(is_16b: bool, rd: Writable<Reg>, rn: Reg) -> u32 {
debug_assert!(!is_16b); // to be supported later.
0b00001110_101_00000_00011_1_00000_00000
| ((is_16b as u32) << 30)
| machreg_to_vec(rd.to_reg())
| (machreg_to_vec(rn) << 16)
| (machreg_to_vec(rn) << 5)
@ -322,8 +355,29 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
(top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
}
impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
fn emit(&self, sink: &mut O, flags: &settings::Flags) {
fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
let bits = 0b0_1_1_01110_00_10000_00000_10_00000_00000;
bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
}
/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
virtual_sp_offset: i64,
}
impl MachInstEmit for Inst {
type State = EmitState;
fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut EmitState) {
// N.B.: we *must* not exceed the "worst-case size" used to compute
// where to insert islands, except when islands are explicitly triggered
// (with an `EmitIsland`). We check this in debug builds. This is `mut`
// to allow disabling the check for `JTSequence`, which is always
// emitted following an `EmitIsland`.
let mut start_off = sink.cur_offset();
match self {
&Inst::AluRRR { alu_op, rd, rn, rm } => {
let top11 = match alu_op {
@ -596,10 +650,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags);
inst.emit(sink, flags, state);
}
// ldst encoding helpers take Reg, not Writable<Reg>.
@ -608,17 +662,17 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
// This is the base opcode (top 10 bits) for the "unscaled
// immediate" form (Unscaled). Other addressing modes will OR in
// other values for bits 24/25 (bits 1/2 of this constant).
let op = match self {
&Inst::ULoad8 { .. } => 0b0011100001,
&Inst::SLoad8 { .. } => 0b0011100010,
&Inst::ULoad16 { .. } => 0b0111100001,
&Inst::SLoad16 { .. } => 0b0111100010,
&Inst::ULoad32 { .. } => 0b1011100001,
&Inst::SLoad32 { .. } => 0b1011100010,
&Inst::ULoad64 { .. } => 0b1111100001,
&Inst::FpuLoad32 { .. } => 0b1011110001,
&Inst::FpuLoad64 { .. } => 0b1111110001,
&Inst::FpuLoad128 { .. } => 0b0011110011,
let (op, bits) = match self {
&Inst::ULoad8 { .. } => (0b0011100001, 8),
&Inst::SLoad8 { .. } => (0b0011100010, 8),
&Inst::ULoad16 { .. } => (0b0111100001, 16),
&Inst::SLoad16 { .. } => (0b0111100010, 16),
&Inst::ULoad32 { .. } => (0b1011100001, 32),
&Inst::SLoad32 { .. } => (0b1011100010, 32),
&Inst::ULoad64 { .. } => (0b1111100001, 64),
&Inst::FpuLoad32 { .. } => (0b1011110001, 32),
&Inst::FpuLoad64 { .. } => (0b1111110001, 64),
&Inst::FpuLoad128 { .. } => (0b0011110011, 128),
_ => unreachable!(),
};
@ -632,6 +686,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd));
}
&MemArg::UnsignedOffset(reg, uimm12scaled) => {
if uimm12scaled.value() != 0 {
assert_eq!(bits, ty_bits(uimm12scaled.scale_ty()));
}
sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd));
}
&MemArg::RegReg(r1, r2) => {
@ -640,19 +697,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
));
}
&MemArg::RegScaled(r1, r2, ty) | &MemArg::RegScaledExtended(r1, r2, ty, _) => {
match (ty, self) {
(I8, &Inst::ULoad8 { .. }) => {}
(I8, &Inst::SLoad8 { .. }) => {}
(I16, &Inst::ULoad16 { .. }) => {}
(I16, &Inst::SLoad16 { .. }) => {}
(I32, &Inst::ULoad32 { .. }) => {}
(I32, &Inst::SLoad32 { .. }) => {}
(I64, &Inst::ULoad64 { .. }) => {}
(F32, &Inst::FpuLoad32 { .. }) => {}
(F64, &Inst::FpuLoad64 { .. }) => {}
(I128, &Inst::FpuLoad128 { .. }) => {}
_ => panic!("Mismatching reg-scaling type in MemArg"),
}
assert_eq!(bits, ty_bits(ty));
let extendop = match &mem {
&MemArg::RegScaled(..) => None,
&MemArg::RegScaledExtended(_, _, _, op) => Some(op),
@ -697,9 +742,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
}
// Eliminated by `mem_finalize()` above.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => {
panic!("Should not see stack-offset here!")
}
&MemArg::SPOffset(..)
| &MemArg::FPOffset(..)
| &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
&MemArg::RegOffset(..) => panic!("Should not see generic reg-offset here!"),
}
}
@ -739,20 +785,20 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags);
inst.emit(sink, flags, state);
}
let op = match self {
&Inst::Store8 { .. } => 0b0011100000,
&Inst::Store16 { .. } => 0b0111100000,
&Inst::Store32 { .. } => 0b1011100000,
&Inst::Store64 { .. } => 0b1111100000,
&Inst::FpuStore32 { .. } => 0b1011110000,
&Inst::FpuStore64 { .. } => 0b1111110000,
&Inst::FpuStore128 { .. } => 0b0011110010,
let (op, bits) = match self {
&Inst::Store8 { .. } => (0b0011100000, 8),
&Inst::Store16 { .. } => (0b0111100000, 16),
&Inst::Store32 { .. } => (0b1011100000, 32),
&Inst::Store64 { .. } => (0b1111100000, 64),
&Inst::FpuStore32 { .. } => (0b1011110000, 32),
&Inst::FpuStore64 { .. } => (0b1111110000, 64),
&Inst::FpuStore128 { .. } => (0b0011110010, 128),
_ => unreachable!(),
};
@ -766,6 +812,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd));
}
&MemArg::UnsignedOffset(reg, uimm12scaled) => {
if uimm12scaled.value() != 0 {
assert_eq!(bits, ty_bits(uimm12scaled.scale_ty()));
}
sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd));
}
&MemArg::RegReg(r1, r2) => {
@ -794,9 +843,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
}
// Eliminated by `mem_finalize()` above.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => {
panic!("Should not see stack-offset here!")
}
&MemArg::SPOffset(..)
| &MemArg::FPOffset(..)
| &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
&MemArg::RegOffset(..) => panic!("Should not see generic reg-offset here!"),
}
}
@ -883,6 +933,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
&Inst::FpuMove64 { rd, rn } => {
sink.put4(enc_vecmov(/* 16b = */ false, rd, rn));
}
&Inst::FpuMove128 { rd, rn } => {
sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
}
&Inst::FpuRR { fpu_op, rd, rn } => {
let top22 = match fpu_op {
FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000,
@ -913,6 +966,44 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
};
sink.put4(enc_fpurrr(top22, rd, rn, rm));
}
&Inst::FpuRRI { fpu_op, rd, rn } => match fpu_op {
FPUOpRI::UShr32(imm) => {
debug_assert_eq!(32, imm.lane_size_in_bits);
sink.put4(
0b0_0_1_011110_0000000_00_0_0_0_1_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
FPUOpRI::UShr64(imm) => {
debug_assert_eq!(64, imm.lane_size_in_bits);
sink.put4(
0b01_1_111110_0000000_00_0_0_0_1_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
FPUOpRI::Sli64(imm) => {
debug_assert_eq!(64, imm.lane_size_in_bits);
sink.put4(
0b01_1_111110_0000000_010101_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
FPUOpRI::Sli32(imm) => {
debug_assert_eq!(32, imm.lane_size_in_bits);
sink.put4(
0b0_0_1_011110_0000000_010101_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
},
&Inst::FpuRRRR {
fpu_op,
rd,
@ -926,6 +1017,15 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
};
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
}
&Inst::VecMisc { op, rd, rn, ty } => {
let bits_12_16 = match op {
VecMisc2::Not => {
debug_assert_eq!(I8X16, ty);
0b00101
}
};
sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
}
&Inst::FpuCmp32 { rn, rm } => {
sink.put4(enc_fcmp(InstSize::Size32, rn, rm));
}
@ -980,11 +1080,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None,
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(8),
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
sink.put4(const_data.to_bits());
}
&Inst::LoadFpuConst64 { rd, const_data } => {
@ -993,13 +1093,29 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None,
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12),
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
sink.put8(const_data.to_bits());
}
&Inst::LoadFpuConst128 { rd, const_data } => {
let inst = Inst::FpuLoad128 {
rd,
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None,
};
inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(20),
};
inst.emit(sink, flags, state);
for i in const_data.to_le_bytes().iter() {
sink.put1(*i);
}
}
&Inst::FpuCSel32 { rd, rn, rm, cond } => {
sink.put4(enc_fcsel(rd, rn, rm, cond, InstSize::Size32));
}
@ -1033,12 +1149,40 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
| machreg_to_gpr(rd.to_reg()),
);
}
&Inst::VecRRR { rd, rn, rm, alu_op } => {
&Inst::VecRRR {
rd,
rn,
rm,
alu_op,
ty,
} => {
let enc_size_for_cmp = match ty {
I8X16 => 0b00,
_ => 0,
};
let (top11, bit15_10) = match alu_op {
VecALUOp::SQAddScalar => (0b010_11110_11_1, 0b000011),
VecALUOp::SQSubScalar => (0b010_11110_11_1, 0b001011),
VecALUOp::UQAddScalar => (0b011_11110_11_1, 0b000011),
VecALUOp::UQSubScalar => (0b011_11110_11_1, 0b001011),
VecALUOp::SQAddScalar => {
debug_assert_eq!(I64, ty);
(0b010_11110_11_1, 0b000011)
}
VecALUOp::SQSubScalar => {
debug_assert_eq!(I64, ty);
(0b010_11110_11_1, 0b001011)
}
VecALUOp::UQAddScalar => {
debug_assert_eq!(I64, ty);
(0b011_11110_11_1, 0b000011)
}
VecALUOp::UQSubScalar => {
debug_assert_eq!(I64, ty);
(0b011_11110_11_1, 0b001011)
}
VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b100011),
VecALUOp::Cmge => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
};
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}
@ -1084,7 +1228,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
if top22 != 0 {
sink.put4(enc_extend(top22, rd, rn));
} else {
Inst::mov32(rd, rn).emit(sink, flags);
Inst::mov32(rd, rn).emit(sink, flags, state);
}
}
&Inst::Extend {
@ -1107,7 +1251,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
rn: zero_reg(),
rm: rd.to_reg(),
};
sub_inst.emit(sink, flags);
sub_inst.emit(sink, flags, state);
}
&Inst::Extend {
rd,
@ -1127,10 +1271,14 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
panic!("Unsupported extend variant");
}
&Inst::Jump { ref dest } => {
// TODO: differentiate between as_off26() returning `None` for
// out-of-range vs. not-yet-finalized. The latter happens when we
// do early (fake) emission for size computation.
sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap()));
let off = sink.cur_offset();
// If the jump target is a label, record its use so that a fixup can occur later.
if let Some(l) = dest.as_label() {
sink.use_label_at_offset(off, l, LabelUse::Branch26);
sink.add_uncond_branch(off, off + 4, l);
}
// Emit the jump itself.
sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero()));
}
&Inst::Ret => {
sink.put4(0xd65f03c0);
@ -1138,71 +1286,47 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
&Inst::EpiloguePlaceholder => {
// Noop; this is just a placeholder for epilogues.
}
&Inst::Call {
ref dest,
loc,
opcode,
..
} => {
sink.add_reloc(loc, Reloc::Arm64Call, dest, 0);
&Inst::Call { ref info } => {
sink.add_reloc(info.loc, Reloc::Arm64Call, &info.dest, 0);
sink.put4(enc_jump26(0b100101, 0));
if opcode.is_call() {
sink.add_call_site(loc, opcode);
if info.opcode.is_call() {
sink.add_call_site(info.loc, info.opcode);
}
}
&Inst::CallInd {
rn, loc, opcode, ..
} => {
sink.put4(0b1101011_0001_11111_000000_00000_00000 | (machreg_to_gpr(rn) << 5));
if opcode.is_call() {
sink.add_call_site(loc, opcode);
&Inst::CallInd { ref info } => {
sink.put4(0b1101011_0001_11111_000000_00000_00000 | (machreg_to_gpr(info.rn) << 5));
if info.opcode.is_call() {
sink.add_call_site(info.loc, info.opcode);
}
}
&Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"),
&Inst::CondBrLowered { target, kind } => match kind {
// TODO: handle >2^19 case by emitting a compound sequence with
// an unconditional (26-bit) branch. We need branch-relaxation
// adjustment machinery to enable this (because we don't want to
// always emit the long form).
CondBrKind::Zero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg));
}
CondBrKind::NotZero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg));
}
CondBrKind::Cond(c) => {
sink.put4(enc_cbr(
0b01010100,
target.as_off19().unwrap_or(0),
0b0,
c.bits(),
));
}
},
&Inst::CondBrLoweredCompound {
&Inst::CondBr {
taken,
not_taken,
kind,
} => {
// Conditional part first.
match kind {
CondBrKind::Zero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg));
}
CondBrKind::NotZero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg));
}
CondBrKind::Cond(c) => {
sink.put4(enc_cbr(
0b01010100,
taken.as_off19().unwrap_or(0),
0b0,
c.bits(),
));
}
let cond_off = sink.cur_offset();
if let Some(l) = taken.as_label() {
sink.use_label_at_offset(cond_off, l, LabelUse::Branch19);
let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes();
sink.add_cond_branch(cond_off, cond_off + 4, l, &inverted[..]);
}
// Unconditional part.
sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0)));
sink.put4(enc_conditional_br(taken, kind));
// Unconditional part next.
let uncond_off = sink.cur_offset();
if let Some(l) = not_taken.as_label() {
sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26);
sink.add_uncond_branch(uncond_off, uncond_off + 4, l);
}
sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero()));
}
&Inst::OneWayCondBr { target, kind } => {
let off = sink.cur_offset();
if let Some(l) = target.as_label() {
sink.use_label_at_offset(off, l, LabelUse::Branch19);
}
sink.put4(enc_conditional_br(target, kind));
}
&Inst::IndirectBr { rn, .. } => {
sink.put4(enc_br(rn));
@ -1219,8 +1343,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.add_trap(srcloc, code);
sink.put4(0xd4a00000);
}
&Inst::Adr { rd, ref label } => {
let off = memlabel_finalize(sink.cur_offset_from_start(), label);
&Inst::Adr { rd, off } => {
assert!(off > -(1 << 20));
assert!(off < (1 << 20));
sink.put4(enc_adr(off, rd));
@ -1235,26 +1358,20 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ridx,
rtmp1,
rtmp2,
ref targets,
ref info,
..
} => {
// This sequence is *one* instruction in the vcode, and is expanded only here at
// emission time, because we cannot allow the regalloc to insert spills/reloads in
// the middle; we depend on hardcoded PC-rel addressing below.
//
// N.B.: if PC-rel addressing on ADR below is changed, also update
// `Inst::with_block_offsets()` in aarch64/inst/mod.rs.
// Save index in a tmp (the live range of ridx only goes to start of this
// sequence; rtmp1 or rtmp2 may overwrite it).
let inst = Inst::gen_move(rtmp2, ridx, I64);
inst.emit(sink, flags);
inst.emit(sink, flags, state);
// Load address of jump table
let inst = Inst::Adr {
rd: rtmp1,
label: MemLabel::PCRel(16),
};
inst.emit(sink, flags);
let inst = Inst::Adr { rd: rtmp1, off: 16 };
inst.emit(sink, flags, state);
// Load value out of jump table
let inst = Inst::SLoad32 {
rd: rtmp2,
@ -1266,7 +1383,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
),
srcloc: None, // can't cause a user trap.
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
// Add base of jump table to jump-table-sourced block offset
let inst = Inst::AluRRR {
alu_op: ALUOp::Add64,
@ -1274,22 +1391,30 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
rn: rtmp1.to_reg(),
rm: rtmp2.to_reg(),
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
// Branch to computed address. (`targets` here is only used for successor queries
// and is not needed for emission.)
let inst = Inst::IndirectBr {
rn: rtmp1.to_reg(),
targets: vec![],
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
// Emit jump table (table of 32-bit offsets).
for target in targets {
let off = target.as_offset_words() * 4;
let off = i32::try_from(off).unwrap();
// cast i32 to u32 (two's-complement)
let off = off as u32;
sink.put4(off);
let jt_off = sink.cur_offset();
for &target in info.targets.iter() {
let word_off = sink.cur_offset();
let off_into_table = word_off - jt_off;
sink.use_label_at_offset(
word_off,
target.as_label().unwrap(),
LabelUse::PCRel32,
);
sink.put4(off_into_table);
}
// Lowering produces an EmitIsland before using a JTSequence, so we can safely
// disable the worst-case-size check in this case.
start_off = sink.cur_offset();
}
&Inst::LoadConst64 { rd, const_data } => {
let inst = Inst::ULoad64 {
@ -1297,11 +1422,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, // can't cause a user trap.
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12),
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
sink.put8(const_data);
}
&Inst::LoadExtName {
@ -1315,11 +1440,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, // can't cause a user trap.
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12),
};
inst.emit(sink, flags);
inst.emit(sink, flags, state);
sink.add_reloc(srcloc, Reloc::Abs8, name, offset);
if flags.emit_all_ones_funcaddrs() {
sink.put8(u64::max_value());
@ -1327,53 +1452,82 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put8(0);
}
}
&Inst::LoadAddr { rd, ref mem } => match *mem {
MemArg::FPOffset(fp_off) => {
let alu_op = if fp_off < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if let Some(imm12) = Imm12::maybe_from_u64(u64::try_from(fp_off.abs()).unwrap())
{
let inst = Inst::AluRRImm12 {
alu_op,
rd,
imm12,
rn: fp_reg(),
};
inst.emit(sink, flags);
} else {
let const_insts =
Inst::load_constant(rd, u64::try_from(fp_off.abs()).unwrap());
for inst in const_insts {
inst.emit(sink, flags);
}
let inst = Inst::AluRRR {
alu_op,
rd,
rn: fp_reg(),
rm: rd.to_reg(),
};
inst.emit(sink, flags);
}
&Inst::LoadAddr { rd, ref mem } => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
}
_ => unimplemented!("{:?}", mem),
},
&Inst::GetPinnedReg { rd } => {
let inst = Inst::Mov {
rd,
rm: xreg(PINNED_REG),
let (reg, offset) = match mem {
MemArg::Unscaled(r, simm9) => (r, simm9.value()),
MemArg::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
inst.emit(sink, flags);
let abs_offset = if offset < 0 {
-offset as u64
} else {
offset as u64
};
let alu_op = if offset < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if offset == 0 {
let mov = Inst::mov(rd, reg);
mov.emit(sink, flags, state);
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,
rd,
rn: reg,
imm12,
};
add.emit(sink, flags, state);
} else {
// Use `tmp2` here: `reg` may be `spilltmp` if the `MemArg` on this instruction
// was initially an `SPOffset`. Assert that `tmp2` is truly free to use. Note
// that no other instructions will be inserted here (we're emitting directly),
// and a live range of `tmp2` should not span this instruction, so this use
// should otherwise be correct.
debug_assert!(rd.to_reg() != tmp2_reg());
debug_assert!(reg != tmp2_reg());
let tmp = writable_tmp2_reg();
for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
insn.emit(sink, flags, state);
}
let add = Inst::AluRRR {
alu_op,
rd,
rn: reg,
rm: tmp.to_reg(),
};
add.emit(sink, flags, state);
}
}
&Inst::SetPinnedReg { rm } => {
let inst = Inst::Mov {
rd: Writable::from_reg(xreg(PINNED_REG)),
rm,
};
inst.emit(sink, flags);
&Inst::VirtualSPOffsetAdj { offset } => {
debug!(
"virtual sp offset adjusted by {} -> {}",
offset,
state.virtual_sp_offset + offset
);
state.virtual_sp_offset += offset;
}
&Inst::EmitIsland { needed_space } => {
if sink.island_needed(needed_space + 4) {
let jump_around_label = sink.get_label();
let jmp = Inst::Jump {
dest: BranchTarget::Label(jump_around_label),
};
jmp.emit(sink, flags, state);
sink.emit_island();
sink.bind_label(jump_around_label);
}
}
}
let end_off = sink.cur_offset();
debug_assert!((end_off - start_off) <= Inst::worst_case_size());
}
}
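The branch arms above follow a record-then-patch pattern: the instruction word is written with a zeroed offset field, and a (code offset, label, kind) use is recorded so the buffer can patch the field once the label is bound. The following self-contained toy sketch illustrates that idea for a 26-bit branch field; it is a simplified model with made-up names (ToyBuffer), not Cranelift's actual MachBuffer API.

use std::convert::TryInto;

// Toy record-then-patch buffer: branches are emitted with a zero offset and a
// recorded (code offset, label) use; binding the label later patches the low
// 26 bits of the branch word, much like enc_jump26 + use_label_at_offset above.
struct ToyBuffer {
    code: Vec<u8>,
    uses: Vec<(usize, usize)>,         // (branch word offset, label index)
    label_offsets: Vec<Option<usize>>, // label index -> bound code offset
}

impl ToyBuffer {
    fn new() -> Self {
        ToyBuffer { code: vec![], uses: vec![], label_offsets: vec![] }
    }
    fn get_label(&mut self) -> usize {
        self.label_offsets.push(None);
        self.label_offsets.len() - 1
    }
    fn put4(&mut self, word: u32) {
        self.code.extend_from_slice(&word.to_le_bytes());
    }
    fn use_label_at_offset(&mut self, off: usize, label: usize) {
        self.uses.push((off, label));
    }
    fn bind_label(&mut self, label: usize) {
        self.label_offsets[label] = Some(self.code.len());
    }
    fn finish(mut self) -> Vec<u8> {
        for &(off, label) in &self.uses {
            let target = self.label_offsets[label].expect("label must be bound");
            let delta_words = (((target as i64) - (off as i64)) / 4) as u32 & 0x03ff_ffff;
            let mut word = u32::from_le_bytes(self.code[off..off + 4].try_into().unwrap());
            word |= delta_words; // patch the 26-bit offset field
            self.code[off..off + 4].copy_from_slice(&word.to_le_bytes());
        }
        self.code
    }
}

fn main() {
    let mut buf = ToyBuffer::new();
    let label = buf.get_label();
    let off = buf.code.len();
    buf.use_label_at_offset(off, label); // "b <label>", offset patched at finish()
    buf.put4(0b000101 << 26);
    buf.put4(0xd503_201f); // nop
    buf.bind_label(label); // label resolves 8 bytes (2 words) past the branch
    let code = buf.finish();
    let branch = u32::from_le_bytes(code[0..4].try_into().unwrap());
    assert_eq!(branch & 0x03ff_ffff, 2);
}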

View file

@ -3,6 +3,7 @@ use crate::isa::aarch64::inst::*;
use crate::isa::test_utils;
use crate::settings;
use alloc::boxed::Box;
use alloc::vec::Vec;
#[test]
@ -1310,38 +1311,68 @@ fn test_aarch64_binemit() {
insns.push((
Inst::ULoad64 {
rd: writable_xreg(1),
mem: MemArg::FPOffset(32768),
mem: MemArg::FPOffset(32768, I8),
srcloc: None,
},
"0F0090D2EF011D8BE10140F9",
"movz x15, #32768 ; add x15, x15, fp ; ldr x1, [x15]",
"100090D2B063308B010240F9",
"movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
Inst::ULoad64 {
rd: writable_xreg(1),
mem: MemArg::FPOffset(-32768),
mem: MemArg::FPOffset(-32768, I8),
srcloc: None,
},
"EFFF8F92EF011D8BE10140F9",
"movn x15, #32767 ; add x15, x15, fp ; ldr x1, [x15]",
"F0FF8F92B063308B010240F9",
"movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
Inst::ULoad64 {
rd: writable_xreg(1),
mem: MemArg::FPOffset(1048576), // 2^20
mem: MemArg::FPOffset(1048576, I8), // 2^20
srcloc: None,
},
"0F02A0D2EF011D8BE10140F9",
"movz x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]",
"1002A0D2B063308B010240F9",
"movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
Inst::ULoad64 {
rd: writable_xreg(1),
mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1
mem: MemArg::FPOffset(1048576 + 1, I8), // 2^20 + 1
srcloc: None,
},
"2F0080D20F02A0F2EF011D8BE10140F9",
"movz x15, #1 ; movk x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]",
"300080D21002A0F2B063308B010240F9",
"movz x16, #1 ; movk x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
Inst::ULoad64 {
rd: writable_xreg(1),
mem: MemArg::RegOffset(xreg(7), 8, I64),
srcloc: None,
},
"E18040F8",
"ldur x1, [x7, #8]",
));
insns.push((
Inst::ULoad64 {
rd: writable_xreg(1),
mem: MemArg::RegOffset(xreg(7), 1024, I64),
srcloc: None,
},
"E10042F9",
"ldr x1, [x7, #1024]",
));
insns.push((
Inst::ULoad64 {
rd: writable_xreg(1),
mem: MemArg::RegOffset(xreg(7), 1048576, I64),
srcloc: None,
},
"1002A0D2F060308B010240F9",
"movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
@ -1801,6 +1832,7 @@ fn test_aarch64_binemit() {
rn: vreg(22),
rm: vreg(23),
alu_op: VecALUOp::UQAddScalar,
ty: I64,
},
"D50EF77E",
"uqadd d21, d22, d23",
@ -1811,6 +1843,7 @@ fn test_aarch64_binemit() {
rn: vreg(22),
rm: vreg(23),
alu_op: VecALUOp::SQAddScalar,
ty: I64,
},
"D50EF75E",
"sqadd d21, d22, d23",
@ -1821,6 +1854,7 @@ fn test_aarch64_binemit() {
rn: vreg(22),
rm: vreg(23),
alu_op: VecALUOp::UQSubScalar,
ty: I64,
},
"D52EF77E",
"uqsub d21, d22, d23",
@ -1831,10 +1865,83 @@ fn test_aarch64_binemit() {
rn: vreg(22),
rm: vreg(23),
alu_op: VecALUOp::SQSubScalar,
ty: I64,
},
"D52EF75E",
"sqsub d21, d22, d23",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Cmeq,
rd: writable_vreg(3),
rn: vreg(23),
rm: vreg(24),
ty: I8X16,
},
"E38E386E",
"cmeq v3.16b, v23.16b, v24.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Cmgt,
rd: writable_vreg(3),
rn: vreg(23),
rm: vreg(24),
ty: I8X16,
},
"E336384E",
"cmgt v3.16b, v23.16b, v24.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Cmge,
rd: writable_vreg(23),
rn: vreg(9),
rm: vreg(12),
ty: I8X16,
},
"373D2C4E",
"cmge v23.16b, v9.16b, v12.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Cmhi,
rd: writable_vreg(5),
rn: vreg(1),
rm: vreg(1),
ty: I8X16,
},
"2534216E",
"cmhi v5.16b, v1.16b, v1.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Cmhs,
rd: writable_vreg(8),
rn: vreg(2),
rm: vreg(15),
ty: I8X16,
},
"483C2F6E",
"cmhs v8.16b, v2.16b, v15.16b",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Not,
rd: writable_vreg(2),
rn: vreg(1),
ty: I8X16,
},
"2258206E",
"mvn v2.16b, v1.16b",
));
insns.push((
Inst::Extend {
rd: writable_xreg(1),
@ -1955,7 +2062,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Zero(xreg(8)),
},
@ -1963,7 +2070,7 @@ fn test_aarch64_binemit() {
"cbz x8, 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::NotZero(xreg(8)),
},
@ -1971,7 +2078,7 @@ fn test_aarch64_binemit() {
"cbnz x8, 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Eq),
},
@ -1979,7 +2086,7 @@ fn test_aarch64_binemit() {
"b.eq 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ne),
},
@ -1988,7 +2095,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Hs),
},
@ -1996,7 +2103,7 @@ fn test_aarch64_binemit() {
"b.hs 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Lo),
},
@ -2004,7 +2111,7 @@ fn test_aarch64_binemit() {
"b.lo 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Mi),
},
@ -2012,7 +2119,7 @@ fn test_aarch64_binemit() {
"b.mi 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Pl),
},
@ -2020,7 +2127,7 @@ fn test_aarch64_binemit() {
"b.pl 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Vs),
},
@ -2028,7 +2135,7 @@ fn test_aarch64_binemit() {
"b.vs 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Vc),
},
@ -2036,7 +2143,7 @@ fn test_aarch64_binemit() {
"b.vc 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Hi),
},
@ -2044,7 +2151,7 @@ fn test_aarch64_binemit() {
"b.hi 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ls),
},
@ -2052,7 +2159,7 @@ fn test_aarch64_binemit() {
"b.ls 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ge),
},
@ -2060,7 +2167,7 @@ fn test_aarch64_binemit() {
"b.ge 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Lt),
},
@ -2068,7 +2175,7 @@ fn test_aarch64_binemit() {
"b.lt 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Gt),
},
@ -2076,7 +2183,7 @@ fn test_aarch64_binemit() {
"b.gt 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Le),
},
@ -2084,7 +2191,7 @@ fn test_aarch64_binemit() {
"b.le 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Al),
},
@ -2092,7 +2199,7 @@ fn test_aarch64_binemit() {
"b.al 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Nv),
},
@ -2101,7 +2208,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLoweredCompound {
Inst::CondBr {
taken: BranchTarget::ResolvedOffset(64),
not_taken: BranchTarget::ResolvedOffset(128),
kind: CondBrKind::Cond(Cond::Le),
@ -2112,11 +2219,13 @@ fn test_aarch64_binemit() {
insns.push((
Inst::Call {
dest: ExternalName::testcase("test0"),
uses: Set::empty(),
defs: Set::empty(),
loc: SourceLoc::default(),
opcode: Opcode::Call,
info: Box::new(CallInfo {
dest: ExternalName::testcase("test0"),
uses: Vec::new(),
defs: Vec::new(),
loc: SourceLoc::default(),
opcode: Opcode::Call,
}),
},
"00000094",
"bl 0",
@ -2124,11 +2233,13 @@ fn test_aarch64_binemit() {
insns.push((
Inst::CallInd {
rn: xreg(10),
uses: Set::empty(),
defs: Set::empty(),
loc: SourceLoc::default(),
opcode: Opcode::CallIndirect,
info: Box::new(CallIndInfo {
rn: xreg(10),
uses: Vec::new(),
defs: Vec::new(),
loc: SourceLoc::default(),
opcode: Opcode::CallIndirect,
}),
},
"40013FD6",
"blr x10",
@ -2137,7 +2248,7 @@ fn test_aarch64_binemit() {
insns.push((
Inst::IndirectBr {
rn: xreg(3),
targets: vec![1, 2, 3],
targets: vec![],
},
"60001FD6",
"br x3",
@ -2148,7 +2259,7 @@ fn test_aarch64_binemit() {
insns.push((
Inst::Adr {
rd: writable_xreg(15),
label: MemLabel::PCRel((1 << 20) - 4),
off: (1 << 20) - 4,
},
"EFFF7F10",
"adr x15, pc+1048572",
@ -2163,6 +2274,15 @@ fn test_aarch64_binemit() {
"mov v8.8b, v4.8b",
));
insns.push((
Inst::FpuMove128 {
rd: writable_vreg(17),
rn: vreg(26),
},
"511FBA4E",
"mov v17.16b, v26.16b",
));
insns.push((
Inst::FpuRR {
fpu_op: FPUOp1::Abs32,
@ -2399,6 +2519,46 @@ fn test_aarch64_binemit() {
"fmadd d15, d30, d31, d1",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(32, 32).unwrap()),
rd: writable_vreg(2),
rn: vreg(5),
},
"A204202F",
"ushr v2.2s, v5.2s, #32",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(63, 64).unwrap()),
rd: writable_vreg(2),
rn: vreg(5),
},
"A204417F",
"ushr d2, d5, #63",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()),
rd: writable_vreg(4),
rn: vreg(10),
},
"44553F2F",
"sli v4.2s, v10.2s, #31",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()),
rd: writable_vreg(4),
rn: vreg(10),
},
"44557F7F",
"sli d4, d10, #63",
));
insns.push((
Inst::FpuToInt {
op: FpuToIntOp::F32ToU32,
@ -2685,6 +2845,15 @@ fn test_aarch64_binemit() {
"ldr d16, pc+8 ; b 12 ; data.f64 1",
));
insns.push((
Inst::LoadFpuConst128 {
rd: writable_vreg(5),
const_data: 0x0f0e0d0c0b0a09080706050403020100,
},
"4500009C05000014000102030405060708090A0B0C0D0E0F",
"ldr q5, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100",
));
insns.push((
Inst::FpuCSel32 {
rd: writable_vreg(1),
@ -2791,19 +2960,11 @@ fn test_aarch64_binemit() {
let actual_printing = insn.show_rru(Some(&rru));
assert_eq!(expected_printing, actual_printing);
// Check the encoding is as expected.
let text_size = {
let mut code_sec = MachSectionSize::new(0);
insn.emit(&mut code_sec, &flags);
code_sec.size()
};
let mut sink = test_utils::TestCodeSink::new();
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, text_size);
let code_sec = sections.get_section(code_idx);
insn.emit(code_sec, &flags);
sections.emit(&mut sink);
let mut buffer = MachBuffer::new();
insn.emit(&mut buffer, &flags, &mut Default::default());
let buffer = buffer.finish();
buffer.emit(&mut sink);
let actual_encoding = &sink.stringify();
assert_eq!(expected_encoding, actual_encoding);
}

View file

@ -106,6 +106,85 @@ impl SImm7Scaled {
}
}
#[derive(Clone, Copy, Debug)]
pub struct FPULeftShiftImm {
pub amount: u8,
pub lane_size_in_bits: u8,
}
impl FPULeftShiftImm {
pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
if amount < lane_size_in_bits {
Some(Self {
amount,
lane_size_in_bits,
})
} else {
None
}
}
pub fn enc(&self) -> u32 {
debug_assert!(self.lane_size_in_bits.is_power_of_two());
debug_assert!(self.lane_size_in_bits > self.amount);
// The encoding of the immediate follows the table below,
// where xs encode the shift amount.
//
// | lane_size_in_bits | encoding |
// +------------------------------+
// | 8 | 0001xxx |
// | 16 | 001xxxx |
// | 32 | 01xxxxx |
// | 64 | 1xxxxxx |
//
// The highest one bit is represented by `lane_size_in_bits`. Since
// `lane_size_in_bits` is a power of 2 and `amount` is less
// than `lane_size_in_bits`, they can be ORed
// together to produce the encoded value.
u32::from(self.lane_size_in_bits | self.amount)
}
}
#[derive(Clone, Copy, Debug)]
pub struct FPURightShiftImm {
pub amount: u8,
pub lane_size_in_bits: u8,
}
impl FPURightShiftImm {
pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
if amount > 0 && amount <= lane_size_in_bits {
Some(Self {
amount,
lane_size_in_bits,
})
} else {
None
}
}
pub fn enc(&self) -> u32 {
debug_assert_ne!(0, self.amount);
// The encoding of the immediate follows the table below,
// where the xs encode the negated shift amount.
//
// | lane_size_in_bits | encoding |
// +------------------------------+
// | 8 | 0001xxx |
// | 16 | 001xxxx |
// | 32 | 01xxxxx |
// | 64 | 1xxxxxx |
//
// The shift amount is negated: a shift amount of 1 (in a
// 64-bit lane) is encoded as 0b111111 and a shift amount of
// 64 is encoded as 0b000000, in the bottom 6 bits.
u32::from((self.lane_size_in_bits * 2) - self.amount)
}
}
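The two encoding tables above reduce to simple bit arithmetic: the lane size supplies the leading marker bit, and the (possibly negated) shift amount fills the low bits. A standalone sketch of that arithmetic follows; the helper names are illustrative, not the crate's API, and the check values mirror the tests below.

// Left-shift immediate: lane_size (a power of two) is the leading one bit,
// and the shift amount fills the bits below it, so OR-ing them matches the
// "01xxxxx" / "1xxxxxx" rows of the table.
fn enc_fpu_left_shift(lane_size_in_bits: u8, amount: u8) -> u32 {
    assert!(lane_size_in_bits.is_power_of_two() && amount < lane_size_in_bits);
    u32::from(lane_size_in_bits | amount)
}

// Right-shift immediate: the amount is stored negated as 2 * lane_size - amount,
// so the bottom bits run from all-ones (shift by 1) down to all-zeros
// (shift by the full lane width).
fn enc_fpu_right_shift(lane_size_in_bits: u8, amount: u8) -> u32 {
    assert!(amount > 0 && amount <= lane_size_in_bits);
    u32::from(2 * lane_size_in_bits - amount)
}

fn main() {
    assert_eq!(enc_fpu_left_shift(32, 31), 0b0111111); // sli v4.2s, ..., #31
    assert_eq!(enc_fpu_left_shift(64, 63), 0b1111111); // sli d4, ..., #63
    assert_eq!(enc_fpu_right_shift(64, 1), 0b1111111);
    assert_eq!(enc_fpu_right_shift(64, 64), 0b1000000);
    assert_eq!(enc_fpu_right_shift(32, 32), 0b0100000); // ushr v2.2s, ..., #32
}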
/// a 9-bit signed offset.
#[derive(Clone, Copy, Debug)]
pub struct SImm9 {
@ -134,6 +213,11 @@ impl SImm9 {
pub fn bits(&self) -> u32 {
(self.value as u32) & 0x1ff
}
/// Signed value of immediate.
pub fn value(&self) -> i32 {
self.value as i32
}
}
/// An unsigned, scaled 12-bit offset.
@ -172,6 +256,16 @@ impl UImm12Scaled {
pub fn bits(&self) -> u32 {
(self.value as u32 / self.scale_ty.bytes()) & 0xfff
}
/// Value after scaling.
pub fn value(&self) -> u32 {
self.value as u32
}
/// The value type which is the scaling base.
pub fn scale_ty(&self) -> Type {
self.scale_ty
}
}
/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted
@ -566,6 +660,18 @@ impl ShowWithRRU for SImm7Scaled {
}
}
impl ShowWithRRU for FPULeftShiftImm {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.amount)
}
}
impl ShowWithRRU for FPURightShiftImm {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.amount)
}
}
impl ShowWithRRU for SImm9 {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.value)

File diff not shown because it is too large.

View file

@ -1,5 +1,6 @@
//! AArch64 ISA definitions: registers.
use crate::ir::types::*;
use crate::isa::aarch64::inst::InstSize;
use crate::machinst::*;
use crate::settings;
@ -20,23 +21,21 @@ pub const PINNED_REG: u8 = 21;
const XREG_INDICES: [u8; 31] = [
// X0 - X7
32, 33, 34, 35, 36, 37, 38, 39,
// X8 - X14
40, 41, 42, 43, 44, 45, 46,
// X15
59,
// X8 - X15
40, 41, 42, 43, 44, 45, 46, 47,
// X16, X17
47, 48,
58, 59,
// X18
60,
// X19, X20
49, 50,
48, 49,
// X21, put aside because it's the pinned register.
58,
57,
// X22 - X28
51, 52, 53, 54, 55, 56, 57,
// X29
50, 51, 52, 53, 54, 55, 56,
// X29 (FP)
61,
// X30
// X30 (LR)
62,
];
@ -125,14 +124,17 @@ pub fn writable_fp_reg() -> Writable<Reg> {
Writable::from_reg(fp_reg())
}
/// Get a reference to the "spill temp" register. This register is used to
/// compute the address of a spill slot when a direct offset addressing mode from
/// FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc
/// and reserve it for this purpose for simplicity; otherwise we need a
/// multi-stage analysis where we first determine how many spill slots we have,
/// then perhaps remove the reg from the pool and recompute regalloc.
/// Get a reference to the first temporary, sometimes "spill temporary", register. This register is
/// used to compute the address of a spill slot when a direct offset addressing mode from FP is not
/// sufficient (+/- 2^11 words). We exclude this register from regalloc and reserve it for this
/// purpose for simplicity; otherwise we need a multi-stage analysis where we first determine how
/// many spill slots we have, then perhaps remove the reg from the pool and recompute regalloc.
///
/// We use x16 for this (aka IP0 in the AArch64 ABI) because it's a scratch register but is
/// slightly special (used for linker veneers). We're free to use it as long as we don't expect it
/// to live through call instructions.
pub fn spilltmp_reg() -> Reg {
xreg(15)
xreg(16)
}
/// Get a writable reference to the spilltmp reg.
@ -140,6 +142,20 @@ pub fn writable_spilltmp_reg() -> Writable<Reg> {
Writable::from_reg(spilltmp_reg())
}
/// Get a reference to the second temp register. We need this in some edge cases
/// where we need both the spilltmp and another temporary.
///
/// We use x17 (aka IP1), the other "interprocedural"/linker-veneer scratch reg that is
/// free to use otherwise.
pub fn tmp2_reg() -> Reg {
xreg(17)
}
/// Get a writable reference to the tmp2 reg.
pub fn writable_tmp2_reg() -> Writable<Reg> {
Writable::from_reg(tmp2_reg())
}
/// Create the register universe for AArch64.
pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
let mut regs = vec![];
@ -173,7 +189,7 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
for i in 0u8..32u8 {
// See above for excluded registers.
if i == 15 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG {
if i == 16 || i == 17 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG {
continue;
}
let reg = Reg::new_real(
@ -191,7 +207,7 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
first: x_reg_base as usize,
last: x_reg_last as usize,
suggested_scratch: Some(XREG_INDICES[13] as usize),
suggested_scratch: Some(XREG_INDICES[19] as usize),
});
allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo {
first: v_reg_base as usize,
@ -211,7 +227,8 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
regs.len()
};
regs.push((xreg(15).to_real_reg(), "x15".to_string()));
regs.push((xreg(16).to_real_reg(), "x16".to_string()));
regs.push((xreg(17).to_real_reg(), "x17".to_string()));
regs.push((xreg(18).to_real_reg(), "x18".to_string()));
regs.push((fp_reg().to_real_reg(), "fp".to_string()));
regs.push((link_reg().to_real_reg(), "lr".to_string()));
@ -259,13 +276,17 @@ pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSiz
s
}
/// Show a vector register when its use as a 32-bit or 64-bit float is known.
/// Show a vector register.
pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String {
let mut s = reg.show_rru(mb_rru);
if reg.get_class() != RegClass::V128 {
return s;
}
let prefix = if size.is32() { "s" } else { "d" };
let prefix = match size {
InstSize::Size32 => "s",
InstSize::Size64 => "d",
InstSize::Size128 => "q",
};
s.replace_range(0..1, prefix);
s
}
@ -291,3 +312,17 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
}
s
}
/// Show a vector register.
pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
assert_eq!(RegClass::V128, reg.get_class());
let mut s = reg.show_rru(mb_rru);
match ty {
I8X16 => s.push_str(".16b"),
F32X2 => s.push_str(".2s"),
_ => unimplemented!(),
}
s
}
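For reference, the register-display conventions implemented by show_freg_sized and show_vreg_vector above can be summarized independently. This is a small illustrative sketch (helper names are made up, not the crate's API) whose outputs match strings in the emit tests, e.g. "q5" and "v17.16b".

// Scalar accesses to a V128 register print with a size prefix; vector
// accesses print with a lane-shape suffix. Illustrative only.
fn scalar_name(reg: u8, bits: u32) -> String {
    let prefix = match bits {
        32 => "s",
        64 => "d",
        128 => "q",
        _ => unreachable!(),
    };
    format!("{}{}", prefix, reg)
}

fn vector_name(reg: u8, lanes: u32, lane_bits: u32) -> String {
    let lane = match lane_bits {
        8 => "b",
        16 => "h",
        32 => "s",
        64 => "d",
        _ => unreachable!(),
    };
    format!("v{}.{}{}", reg, lanes, lane)
}

fn main() {
    assert_eq!(scalar_name(5, 128), "q5"); // LoadFpuConst128: "ldr q5, ..."
    assert_eq!(vector_name(17, 16, 8), "v17.16b"); // FpuMove128: "mov v17.16b, v26.16b"
    assert_eq!(vector_name(4, 2, 32), "v4.2s"); // FPUOpRI::Sli32: "sli v4.2s, ..."
}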

View file

@ -14,12 +14,14 @@ use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::CodegenResult;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::AArch64Backend;
use super::lower_inst;
use log::debug;
use regalloc::{Reg, RegClass, Writable};
//============================================================================
@ -104,18 +106,11 @@ pub(crate) enum ResultRegImmShift {
}
//============================================================================
// Instruction input and output "slots".
// Instruction input "slots".
//
// We use these types to refer to operand numbers, and result numbers, together
// with the associated instruction, in a type-safe way.
/// Identifier for a particular output of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct InsnOutput {
pub(crate) insn: IRInst,
pub(crate) output: usize,
}
/// Identifier for a particular input of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct InsnInput {
@ -123,95 +118,55 @@ pub(crate) struct InsnInput {
pub(crate) input: usize,
}
/// Producer of a value: either a previous instruction's output, or a register that will be
/// codegen'd separately.
/// Identifier for a particular output of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum InsnInputSource {
Output(InsnOutput),
Reg(Reg),
}
impl InsnInputSource {
fn as_output(self) -> Option<InsnOutput> {
match self {
InsnInputSource::Output(o) => Some(o),
_ => None,
}
}
}
fn get_input<C: LowerCtx<I = Inst>>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput {
assert!(num <= ctx.num_inputs(output.insn));
InsnInput {
insn: output.insn,
input: num,
}
}
/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a
/// register otherwise.
fn input_source<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> InsnInputSource {
if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) {
let out = InsnOutput {
insn: input_inst,
output: result_num,
};
InsnInputSource::Output(out)
} else {
let reg = ctx.input(input.insn, input.input);
InsnInputSource::Reg(reg)
}
pub(crate) struct InsnOutput {
pub(crate) insn: IRInst,
pub(crate) output: usize,
}
//============================================================================
// Lowering: convert instruction outputs to result types.
// Lowering: convert instruction inputs to forms that we can use.
/// Lower an instruction output to a 64-bit constant, if possible.
pub(crate) fn output_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Option<u64> {
/// Lower an instruction input to a 64-bit constant, if possible.
pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
let input = ctx.get_input(input.insn, input.input);
input.constant
}
/// Lower an instruction input to a constant register-shift amount, if possible.
pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
ctx: &mut C,
input: InsnInput,
) -> Option<ShiftOpShiftImm> {
input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
}
pub(crate) fn output_to_const_f128<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<u128> {
if out.output > 0 {
None
} else {
let inst_data = ctx.data(out.insn);
if inst_data.opcode() == Opcode::Null {
Some(0)
} else {
match inst_data {
&InstructionData::UnaryImm { opcode: _, imm } => {
// Only has Into for i64; we use u64 elsewhere, so we cast.
let imm: i64 = imm.into();
Some(imm as u64)
}
&InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)),
&InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())),
&InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()),
_ => None,
match inst_data {
&InstructionData::UnaryConst {
opcode: _,
constant_handle,
} => {
let mut bytes = [0u8; 16];
let c = ctx.get_constant_data(constant_handle).clone().into_vec();
assert_eq!(c.len(), 16);
bytes.copy_from_slice(&c);
Some(u128::from_le_bytes(bytes))
}
_ => None,
}
}
}
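output_to_const_f128 reassembles the 16 bytes behind a constant handle into a u128 in little-endian order. A minimal standalone sketch of just that conversion (the helper name and sample bytes are made up for illustration):

// Reassemble a 16-byte little-endian constant into a u128, mirroring the
// copy_from_slice + u128::from_le_bytes step above.
fn const_f128_from_bytes(c: &[u8]) -> Option<u128> {
    if c.len() != 16 {
        return None;
    }
    let mut bytes = [0u8; 16];
    bytes.copy_from_slice(c);
    Some(u128::from_le_bytes(bytes))
}

fn main() {
    // Byte 0 is the least-significant byte, matching the
    // "data.f128 0x0f0e0d0c0b0a09080706050403020100" expectation in the emit tests.
    let data: Vec<u8> = (0u8..16).collect();
    assert_eq!(
        const_f128_from_bytes(&data),
        Some(0x0f0e0d0c0b0a09080706050403020100u128)
    );
}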
pub(crate) fn output_to_const_f32<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<f32> {
output_to_const(ctx, out).map(|value| f32::from_bits(value as u32))
}
pub(crate) fn output_to_const_f64<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<f64> {
output_to_const(ctx, out).map(|value| f64::from_bits(value))
}
/// Lower an instruction output to a constant register-shift amount, if possible.
pub(crate) fn output_to_shiftimm<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<ShiftOpShiftImm> {
output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift)
}
/// How to handle narrow values loaded into registers; see note on `narrow_mode`
/// parameter to `input_to_*` below.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@ -237,9 +192,9 @@ impl NarrowValueMode {
}
}
/// Lower an instruction output to a reg.
/// Allocate a register for an instruction output and return it.
pub(crate) fn output_to_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Writable<Reg> {
ctx.output(out.insn, out.output)
ctx.get_output(out.insn, out.output)
}
/// Lower an instruction input to a reg.
@ -252,13 +207,31 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> Reg {
debug!("input_to_reg: input {:?}", input);
let ty = ctx.input_ty(input.insn, input.input);
let from_bits = ty_bits(ty) as u8;
let in_reg = ctx.input(input.insn, input.input);
let inputs = ctx.get_input(input.insn, input.input);
let in_reg = if let Some(c) = inputs.constant {
let masked = if from_bits < 64 {
c & ((1u64 << from_bits) - 1)
} else {
c
};
// Generate constants fresh at each use to minimize long-range register pressure.
let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
for inst in Inst::gen_constant(to_reg, masked, ty).into_iter() {
ctx.emit(inst);
}
to_reg.to_reg()
} else {
ctx.use_input_reg(inputs);
inputs.reg
};
match (narrow_mode, from_bits) {
(NarrowValueMode::None, _) => in_reg,
(NarrowValueMode::ZeroExtend32, n) if n < 32 => {
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@ -269,7 +242,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
tmp.to_reg()
}
(NarrowValueMode::SignExtend32, n) if n < 32 => {
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@ -282,18 +255,23 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
(NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
(NarrowValueMode::ZeroExtend64, n) if n < 64 => {
let tmp = ctx.tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
signed: false,
from_bits,
to_bits: 64,
});
tmp.to_reg()
if inputs.constant.is_some() {
// Constants are zero-extended to full 64-bit width on load already.
in_reg
} else {
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
signed: false,
from_bits,
to_bits: 64,
});
tmp.to_reg()
}
}
(NarrowValueMode::SignExtend64, n) if n < 64 => {
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@ -304,6 +282,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
tmp.to_reg()
}
(_, 64) => in_reg,
(_, 128) => in_reg,
_ => panic!(
"Unsupported input width: input ty {} bits {} mode {:?}",
@ -313,8 +292,6 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
}
/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
/// This does not actually codegen the source instruction; it just uses the
/// vreg into which the source instruction will generate its value.
///
/// The `narrow_mode` flag indicates whether the consumer of this value needs
/// the high bits clear. For many operations, such as an add/sub/mul or any
@ -330,23 +307,18 @@ fn input_to_rs<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRS {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
let insn = out.insn;
assert!(out.output <= ctx.num_outputs(insn));
let inputs = ctx.get_input(input.insn, input.input);
if let Some((insn, 0)) = inputs.inst {
let op = ctx.data(insn).opcode();
if op == Opcode::Ishl {
let shiftee = get_input(ctx, out, 0);
let shift_amt = get_input(ctx, out, 1);
let shiftee = InsnInput { insn, input: 0 };
let shift_amt = InsnInput { insn, input: 1 };
// Can we get the shift amount as an immediate?
if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() {
if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) {
let reg = input_to_reg(ctx, shiftee, narrow_mode);
ctx.merged(insn);
ctx.merged(shift_amt_out.insn);
return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
}
if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) {
let reg = input_to_reg(ctx, shiftee, narrow_mode);
return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
}
}
}
@ -364,11 +336,10 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSE {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
let insn = out.insn;
assert!(out.output <= ctx.num_outputs(insn));
let inputs = ctx.get_input(input.insn, input.input);
if let Some((insn, 0)) = inputs.inst {
let op = ctx.data(insn).opcode();
let out_ty = ctx.output_ty(insn, out.output);
let out_ty = ctx.output_ty(insn, 0);
let out_bits = ty_bits(out_ty);
// If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend,
@ -378,7 +349,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
&& ((narrow_mode.is_32bit() && out_bits < 32)
|| (!narrow_mode.is_32bit() && out_bits < 64))
{
let reg = output_to_reg(ctx, out);
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
let extendop = match (narrow_mode, out_bits) {
(NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => {
ExtendOp::SXTB
@ -402,15 +373,14 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
(NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW,
_ => unreachable!(),
};
return ResultRSE::RegExtend(reg.to_reg(), extendop);
return ResultRSE::RegExtend(reg, extendop);
}
// Is this a zero-extend or sign-extend and can we handle that with a register-mode operator?
if op == Opcode::Uextend || op == Opcode::Sextend {
assert!(out_bits == 32 || out_bits == 64);
let sign_extend = op == Opcode::Sextend;
let extendee = get_input(ctx, out, 0);
let inner_ty = ctx.input_ty(extendee.insn, extendee.input);
let inner_ty = ctx.input_ty(insn, 0);
let inner_bits = ty_bits(inner_ty);
assert!(inner_bits < out_bits);
let extendop = match (sign_extend, inner_bits) {
@ -424,8 +394,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
(false, 32) => ExtendOp::UXTW,
_ => unreachable!(),
};
let reg = input_to_reg(ctx, extendee, NarrowValueMode::None);
ctx.merged(insn);
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
return ResultRSE::RegExtend(reg, extendop);
}
}
@ -438,12 +407,9 @@ pub(crate) fn input_to_rse_imm12<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSEImm12 {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
if let Some(i) = Imm12::maybe_from_u64(imm_value) {
ctx.merged(out.insn);
return ResultRSEImm12::Imm12(i);
}
if let Some(imm_value) = input_to_const(ctx, input) {
if let Some(i) = Imm12::maybe_from_u64(imm_value) {
return ResultRSEImm12::Imm12(i);
}
}
@ -455,14 +421,11 @@ pub(crate) fn input_to_rs_immlogic<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSImmLogic {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
let ty = ctx.output_ty(out.insn, out.output);
let ty = if ty_bits(ty) < 32 { I32 } else { ty };
if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
ctx.merged(out.insn);
return ResultRSImmLogic::ImmLogic(i);
}
if let Some(imm_value) = input_to_const(ctx, input) {
let ty = ctx.input_ty(input.insn, input.input);
let ty = if ty_bits(ty) < 32 { I32 } else { ty };
if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
return ResultRSImmLogic::ImmLogic(i);
}
}
@ -473,12 +436,9 @@ pub(crate) fn input_to_reg_immshift<C: LowerCtx<I = Inst>>(
ctx: &mut C,
input: InsnInput,
) -> ResultRegImmShift {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
ctx.merged(out.insn);
return ResultRegImmShift::ImmShift(immshift);
}
if let Some(imm_value) = input_to_const(ctx, input) {
if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
return ResultRegImmShift::ImmShift(immshift);
}
}
@ -584,12 +544,10 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
// TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
// mul instructions (Load/StoreComplex don't include scale factors).
// Handle one reg and offset that fits in immediate, if possible.
// Handle one reg and offset.
if addends.len() == 1 {
let reg = input_to_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
if let Some(memarg) = MemArg::reg_maybe_offset(reg, offset as i64, elem_ty) {
return memarg;
}
return MemArg::RegOffset(reg, offset as i64, elem_ty);
}
// Handle two regs and a zero offset, if possible.
@ -600,7 +558,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
}
// Otherwise, generate add instructions.
let addr = ctx.tmp(RegClass::I64, I64);
let addr = ctx.alloc_tmp(RegClass::I64, I64);
// Get the const into a reg.
lower_constant_u64(ctx, addr.clone(), offset as u64);
@ -612,7 +570,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
// In an addition, the stack register is the zero register, so divert it to another
// register just before doing the actual add.
let reg = if reg == stack_reg() {
let tmp = ctx.tmp(RegClass::I64, I64);
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
ctx.emit(Inst::Mov {
rd: tmp,
rm: stack_reg(),
@ -659,6 +617,14 @@ pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::load_fp_constant64(rd, value));
}
pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
ctx: &mut C,
rd: Writable<Reg>,
value: u128,
) {
ctx.emit(Inst::load_fp_constant128(rd, value));
}
pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
match cc {
IntCC::Equal => Cond::Eq,
@ -750,6 +716,7 @@ pub fn ty_bits(ty: Type) -> usize {
B64 | I64 | F64 => 64,
B128 | I128 => 128,
IFLAGS | FFLAGS => 32,
I8X16 | B8X16 => 128,
_ => panic!("ty_bits() on unknown type: {:?}", ty),
}
}
@ -757,7 +724,7 @@ pub fn ty_bits(ty: Type) -> usize {
pub(crate) fn ty_is_int(ty: Type) -> bool {
match ty {
B1 | B8 | I8 | B16 | I16 | B32 | I32 | B64 | I64 => true,
F32 | F64 | B128 | I128 => false,
F32 | F64 | B128 | I128 | I8X16 => false,
IFLAGS | FFLAGS => panic!("Unexpected flags type"),
_ => panic!("ty_is_int() on unknown type: {:?}", ty),
}
@ -823,24 +790,29 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> {
}
}
/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so.
/// Checks for an instance of `op` feeding the given input.
pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
c: &mut C,
input: InsnInput,
op: Opcode,
) -> Option<IRInst> {
if let InsnInputSource::Output(out) = input_source(c, input) {
let data = c.data(out.insn);
let inputs = c.get_input(input.insn, input.input);
debug!(
"maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
input, inputs, op
);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
debug!(" -> input inst {:?}", data);
if data.opcode() == op {
c.merged(out.insn);
return Some(out.insn);
return Some(src_inst);
}
}
None
}
/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g.,
/// Bint or a bitcast). Marks one or both as merged if so, as appropriate.
/// Bint or a bitcast).
///
/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it
/// a bit more generic.
@ -850,21 +822,19 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
op: Opcode,
conv: Opcode,
) -> Option<IRInst> {
if let Some(ret) = maybe_input_insn(c, input, op) {
return Some(ret);
}
if let InsnInputSource::Output(out) = input_source(c, input) {
let data = c.data(out.insn);
let inputs = c.get_input(input.insn, input.input);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
if data.opcode() == conv {
let conv_insn = out.insn;
let conv_input = InsnInput {
insn: conv_insn,
input: 0,
};
if let Some(inner) = maybe_input_insn(c, conv_input, op) {
c.merged(conv_insn);
return Some(inner);
let inputs = c.get_input(src_inst, 0);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
}
}
}
@ -876,6 +846,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
insn: IRInst,
is_signed: bool,
) {
debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn);
let ty = ctx.input_ty(insn, 0);
let bits = ty_bits(ty);
let narrow_mode = match (bits <= 32, is_signed) {
@ -897,6 +868,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
let ty = ctx.input_ty(insn, 0);
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm);
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
let rd = writable_zero_reg();
ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
@ -934,17 +906,21 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
impl LowerBackend for AArch64Backend {
type MInst = Inst;
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) {
lower_inst::lower_insn_to_regs(ctx, ir_inst);
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
lower_inst::lower_insn_to_regs(ctx, ir_inst)
}
fn lower_branch_group<C: LowerCtx<I = Inst>>(
&self,
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
lower_inst::lower_branch(ctx, branches, targets, fallthrough)
}
fn maybe_pinned_reg(&self) -> Option<Reg> {
Some(xreg(PINNED_REG))
}
}

View file

@ -1,17 +1,20 @@
//! Lower a single Cranelift instruction into vcode.
use crate::binemit::CodeOffset;
use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::{CodegenError, CodegenResult};
use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;
use regalloc::RegClass;
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;
use smallvec::SmallVec;
@ -19,7 +22,10 @@ use smallvec::SmallVec;
use super::lower::*;
/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
) -> CodegenResult<()> {
let op = ctx.data(insn).opcode();
let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
.map(|i| InsnInput { insn, input: i })
@ -35,17 +41,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
match op {
Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
let value = output_to_const(ctx, outputs[0]).unwrap();
let value = ctx.get_constant(insn).unwrap();
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_u64(ctx, rd, value);
}
Opcode::F32const => {
let value = output_to_const_f32(ctx, outputs[0]).unwrap();
let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_f32(ctx, rd, value);
}
Opcode::F64const => {
let value = output_to_const_f64(ctx, outputs[0]).unwrap();
let value = f64::from_bits(ctx.get_constant(insn).unwrap());
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_f64(ctx, rd, value);
}
@ -79,8 +85,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
} else {
VecALUOp::UQAddScalar
};
let va = ctx.tmp(RegClass::V128, I128);
let vb = ctx.tmp(RegClass::V128, I128);
let va = ctx.alloc_tmp(RegClass::V128, I128);
let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = input_to_reg(ctx, inputs[0], narrow_mode);
let rb = input_to_reg(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
@ -91,6 +97,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn: va.to_reg(),
rm: vb.to_reg(),
alu_op,
ty: I64,
});
ctx.emit(Inst::MovFromVec64 {
rd,
@ -110,8 +117,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
} else {
VecALUOp::UQSubScalar
};
let va = ctx.tmp(RegClass::V128, I128);
let vb = ctx.tmp(RegClass::V128, I128);
let va = ctx.alloc_tmp(RegClass::V128, I128);
let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = input_to_reg(ctx, inputs[0], narrow_mode);
let rb = input_to_reg(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
@ -122,6 +129,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn: va.to_reg(),
rm: vb.to_reg(),
alu_op,
ty: I64,
});
ctx.emit(Inst::MovFromVec64 {
rd,
@ -271,7 +279,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 8;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::NotZero(rm),
});
@ -297,7 +305,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 20;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::Zero(rm),
});
@ -324,7 +332,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
nzcv: NZCV::new(false, false, false, false),
cond: Cond::Eq,
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(12),
kind: CondBrKind::Cond(Cond::Vc),
});
@ -337,7 +345,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 8;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::NotZero(rm),
});
@ -493,7 +501,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
let tmp = ctx.tmp(RegClass::I64, ty);
let tmp = ctx.alloc_tmp(RegClass::I64, ty);
ctx.emit(Inst::AluRRR {
alu_op,
rd: tmp,
@ -516,7 +524,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Really ty_bits_size - rn, but the upper bits of the result are
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub32,
rd: tmp,
@ -529,7 +537,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
};
// Explicitly mask the rotation count.
let tmp_masked_rm = ctx.tmp(RegClass::I64, I32);
let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImmLogic {
alu_op: ALUOp::And32,
rd: tmp_masked_rm,
@ -538,8 +546,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
});
let tmp_masked_rm = tmp_masked_rm.to_reg();
let tmp1 = ctx.tmp(RegClass::I64, I32);
let tmp2 = ctx.tmp(RegClass::I64, I32);
let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
let tmp2 = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub32,
rd: tmp1,
@ -578,7 +586,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
}
immshift.imm &= ty_bits_size - 1;
let tmp1 = ctx.tmp(RegClass::I64, I32);
let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr32,
rd: tmp1,
@ -683,7 +691,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// and fix the sequence below to work properly for this.
let narrow_mode = NarrowValueMode::ZeroExtend64;
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
let tmp = ctx.tmp(RegClass::I64, I64);
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
// If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
// the rest of the code is identical to the 64-bit version.
@ -870,6 +878,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
(32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc },
(64, _, false) => Inst::ULoad64 { rd, mem, srcloc },
(64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc },
(128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc },
_ => panic!("Unsupported size in load"),
});
}
@ -909,6 +918,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
(32, true) => Inst::FpuStore32 { rd, mem, srcloc },
(64, false) => Inst::Store64 { rd, mem, srcloc },
(64, true) => Inst::FpuStore64 { rd, mem, srcloc },
(128, _) => Inst::FpuStore128 { rd, mem, srcloc },
_ => panic!("Unsupported size in store"),
});
}
@ -992,7 +1002,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
}
Opcode::Bitselect => {
let tmp = ctx.tmp(RegClass::I64, I64);
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
let rd = output_to_reg(ctx, outputs[0]);
let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
@ -1145,12 +1155,66 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
(false, true) => NarrowValueMode::SignExtend64,
(false, false) => NarrowValueMode::ZeroExtend64,
};
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
ctx.emit(Inst::CondSet { cond, rd });
if ty_bits(ty) < 128 {
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
ctx.emit(Inst::CondSet { cond, rd });
} else {
if ty != I8X16 {
return Err(CodegenError::Unsupported(format!(
"unsupported simd type: {:?}",
ty
)));
}
let mut rn = input_to_reg(ctx, inputs[0], narrow_mode);
let mut rm = input_to_reg(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
// 'Less than' operations are implemented by swapping
// the order of operands and using the 'greater than'
// instructions.
// 'Not equal' is implemented with 'equal' and inverting
// the result.
let (alu_op, swap) = match cond {
Cond::Eq => (VecALUOp::Cmeq, false),
Cond::Ne => (VecALUOp::Cmeq, false),
Cond::Ge => (VecALUOp::Cmge, false),
Cond::Gt => (VecALUOp::Cmgt, false),
Cond::Le => (VecALUOp::Cmge, true),
Cond::Lt => (VecALUOp::Cmgt, true),
Cond::Hs => (VecALUOp::Cmhs, false),
Cond::Hi => (VecALUOp::Cmhi, false),
Cond::Ls => (VecALUOp::Cmhs, true),
Cond::Lo => (VecALUOp::Cmhi, true),
_ => unreachable!(),
};
if swap {
std::mem::swap(&mut rn, &mut rm);
}
ctx.emit(Inst::VecRRR {
alu_op,
rd,
rn,
rm,
ty,
});
if cond == Cond::Ne {
ctx.emit(Inst::VecMisc {
op: VecMisc2::Not,
rd,
rn: rd.to_reg(),
ty: I8X16,
});
}
}
}
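
A standalone sketch of the swap/invert scheme used above (plain Rust for illustration; `plan` and `cmp_lanes` are made-up names, not Cranelift API): a condition maps to a base comparison plus swap and invert flags, so "less than" becomes "greater than" with swapped operands and "not equal" becomes "equal" with the result inverted.

// Illustrative sketch only; not Cranelift code.
enum Cond { Eq, Ne, Gt, Lt }

// Map a condition to (use_greater_than, swap_operands, invert_result).
fn plan(cond: &Cond) -> (bool, bool, bool) {
    match cond {
        Cond::Eq => (false, false, false),
        Cond::Ne => (false, false, true),  // equal, then invert
        Cond::Gt => (true, false, false),
        Cond::Lt => (true, true, false),   // gt with swapped operands
    }
}

fn cmp_lanes(cond: &Cond, mut a: Vec<i8>, mut b: Vec<i8>) -> Vec<bool> {
    let (gt, swap, invert) = plan(cond);
    if swap {
        std::mem::swap(&mut a, &mut b);
    }
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (if gt { x > y } else { x == y }) ^ invert)
        .collect()
}

fn main() {
    // lt(a, b) is gt(b, a); ne(a, b) is !eq(a, b).
    assert_eq!(cmp_lanes(&Cond::Lt, vec![1, 5], vec![2, 3]), vec![true, false]);
    assert_eq!(cmp_lanes(&Cond::Ne, vec![1, 3], vec![1, 4]), vec![false, true]);
    assert_eq!(cmp_lanes(&Cond::Gt, vec![9, 0], vec![1, 1]), vec![true, false]);
    assert_eq!(cmp_lanes(&Cond::Eq, vec![7], vec![7]), vec![true]);
}
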
Opcode::Fcmp => {
@ -1188,7 +1252,15 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::Trapif | Opcode::Trapff => {
let trap_info = (ctx.srcloc(insn), inst_trapcode(ctx.data(insn)).unwrap());
let cond = if op == Opcode::Trapif {
let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
let condcode = inst_condcode(ctx.data(insn)).unwrap();
let cond = lower_condcode(condcode);
// The flags must not have been clobbered by any other
// instruction between the iadd_ifcout and this instruction, as
// verified by the CLIF validator; so we can simply use the
// flags here.
cond
} else if op == Opcode::Trapif {
let condcode = inst_condcode(ctx.data(insn)).unwrap();
let cond = lower_condcode(condcode);
let is_signed = condcode_is_signed(condcode);
@ -1211,7 +1283,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Branch around the break instruction with inverted cond. Go straight to lowered
// one-target form; this is logically part of a single-in single-out template lowering.
let cond = cond.invert();
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(cond),
});
@ -1233,11 +1305,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::FuncAddr => {
let rd = output_to_reg(ctx, outputs[0]);
let extname = ctx.call_target(insn).unwrap().clone();
let (extname, _) = ctx.call_target(insn).unwrap();
let extname = extname.clone();
let loc = ctx.srcloc(insn);
ctx.emit(Inst::LoadExtName {
rd,
name: extname,
name: Box::new(extname),
srcloc: loc,
offset: 0,
});
@ -1249,12 +1322,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::SymbolValue => {
let rd = output_to_reg(ctx, outputs[0]);
let (extname, offset) = ctx.symbol_value(insn).unwrap();
let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
let extname = extname.clone();
let loc = ctx.srcloc(insn);
ctx.emit(Inst::LoadExtName {
rd,
name: extname,
name: Box::new(extname),
srcloc: loc,
offset,
});
@ -1262,54 +1335,50 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::Call | Opcode::CallIndirect => {
let loc = ctx.srcloc(insn);
let (abi, inputs) = match op {
let (mut abi, inputs) = match op {
Opcode::Call => {
let extname = ctx.call_target(insn).unwrap();
let (extname, dist) = ctx.call_target(insn).unwrap();
let extname = extname.clone();
let sig = ctx.call_sig(insn).unwrap();
assert!(inputs.len() == sig.params.len());
assert!(outputs.len() == sig.returns.len());
(AArch64ABICall::from_func(sig, &extname, loc), &inputs[..])
(
AArch64ABICall::from_func(sig, &extname, dist, loc)?,
&inputs[..],
)
}
Opcode::CallIndirect => {
let ptr = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
let sig = ctx.call_sig(insn).unwrap();
assert!(inputs.len() - 1 == sig.params.len());
assert!(outputs.len() == sig.returns.len());
(AArch64ABICall::from_ptr(sig, ptr, loc, op), &inputs[1..])
(AArch64ABICall::from_ptr(sig, ptr, loc, op)?, &inputs[1..])
}
_ => unreachable!(),
};
for inst in abi.gen_stack_pre_adjust().into_iter() {
ctx.emit(inst);
}
abi.emit_stack_pre_adjust(ctx);
assert!(inputs.len() == abi.num_args());
for (i, input) in inputs.iter().enumerate() {
let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None);
for inst in abi.gen_copy_reg_to_arg(ctx, i, arg_reg) {
ctx.emit(inst);
}
}
for inst in abi.gen_call().into_iter() {
ctx.emit(inst);
abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
}
abi.emit_call(ctx);
for (i, output) in outputs.iter().enumerate() {
let retval_reg = output_to_reg(ctx, *output);
ctx.emit(abi.gen_copy_retval_to_reg(i, retval_reg));
}
for inst in abi.gen_stack_post_adjust().into_iter() {
ctx.emit(inst);
abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
}
abi.emit_stack_post_adjust(ctx);
}
Opcode::GetPinnedReg => {
let rd = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::GetPinnedReg { rd });
ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
}
Opcode::SetPinnedReg => {
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::SetPinnedReg { rm });
ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
}
Opcode::Spill
@ -1340,8 +1409,20 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
panic!("Branch opcode reached non-branch lowering logic!");
}
Opcode::Vconst
| Opcode::Shuffle
Opcode::Vconst => {
let value = output_to_const_f128(ctx, outputs[0]).unwrap();
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_f128(ctx, rd, value);
}
Opcode::RawBitcast => {
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = output_to_reg(ctx, outputs[0]);
let ty = ctx.input_ty(insn, 0);
ctx.emit(Inst::gen_move(rd, rm, ty));
}
Opcode::Shuffle
| Opcode::Vsplit
| Opcode::Vconcat
| Opcode::Vselect
@ -1350,15 +1431,20 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
| Opcode::Splat
| Opcode::Insertlane
| Opcode::Extractlane
| Opcode::RawBitcast
| Opcode::ScalarToVector
| Opcode::Swizzle
| Opcode::Uload8x8
| Opcode::Uload8x8Complex
| Opcode::Sload8x8
| Opcode::Sload8x8Complex
| Opcode::Uload16x4
| Opcode::Uload16x4Complex
| Opcode::Sload16x4
| Opcode::Sload16x4Complex
| Opcode::Uload32x2
| Opcode::Sload32x2 => {
| Opcode::Uload32x2Complex
| Opcode::Sload32x2
| Opcode::Sload32x2Complex => {
// TODO
panic!("Vector ops not implemented.");
}
@ -1452,54 +1538,38 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::Fcopysign => {
// Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
//
// (64 bits for example, 32-bit sequence is analogous):
// This is a scalar Fcopysign.
// This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
//
// MOV Xtmp1, Dinput0
// MOV Xtmp2, Dinput1
// AND Xtmp2, 0x8000_0000_0000_0000
// BIC Xtmp1, 0x8000_0000_0000_0000
// ORR Xtmp1, Xtmp1, Xtmp2
// MOV Doutput, Xtmp1
// mov vd, vn
// ushr vtmp, vm, #63 / #31
// sli vd, vtmp, #63 / #31
let ty = ctx.output_ty(insn, 0);
let bits = ty_bits(ty);
let bits = ty_bits(ty) as u8;
assert!(bits == 32 || bits == 64);
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = output_to_reg(ctx, outputs[0]);
let tmp1 = ctx.tmp(RegClass::I64, I64);
let tmp2 = ctx.tmp(RegClass::I64, I64);
ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
let imml = if bits == 32 {
ImmLogic::maybe_from_u64(0x8000_0000, I32).unwrap()
} else {
ImmLogic::maybe_from_u64(0x8000_0000_0000_0000, I64).unwrap()
};
let alu_op = choose_32_64(ty, ALUOp::And32, ALUOp::And64);
ctx.emit(Inst::AluRRImmLogic {
alu_op,
rd: tmp2,
rn: tmp2.to_reg(),
imml: imml.clone(),
let tmp = ctx.alloc_tmp(RegClass::V128, F64);
// Copy LHS to rd.
ctx.emit(Inst::FpuMove64 { rd, rn });
// Copy the sign bit to the lowest bit in tmp.
let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
ctx.emit(Inst::FpuRRI {
fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
rd: tmp,
rn: rm,
});
let alu_op = choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64);
ctx.emit(Inst::AluRRImmLogic {
alu_op,
rd: tmp1,
rn: tmp1.to_reg(),
imml,
});
let alu_op = choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64);
ctx.emit(Inst::AluRRR {
alu_op,
rd: tmp1,
rn: tmp1.to_reg(),
rm: tmp2.to_reg(),
});
ctx.emit(Inst::MovToVec64 {
// Insert the bit from tmp into the sign bit of rd.
let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
ctx.emit(Inst::FpuRRI {
fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
rd,
rn: tmp1.to_reg(),
rn: tmp.to_reg(),
});
}
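
A minimal scalar sketch of the same sign-transfer idea, using ordinary f64 bit manipulation rather than the NEON ushr/sli pair (illustration only; `copysign64` is a made-up helper, not the emitted sequence):

// Illustrative sketch only; not Cranelift code.
// Copy the sign bit of `sign_src` onto `magnitude`: shift the sign bit
// down ("ushr #63"), then insert it at bit 63 ("sli #63").
fn copysign64(magnitude: f64, sign_src: f64) -> f64 {
    let m = magnitude.to_bits();
    let s = sign_src.to_bits() >> 63;
    f64::from_bits((m & !(1u64 << 63)) | (s << 63))
}

fn main() {
    assert_eq!(copysign64(1.5, -0.0), -1.5);
    assert_eq!(copysign64(-2.0, 3.0), 2.0);
}
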
@ -1531,14 +1601,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
} else {
ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
}
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Ordered)),
});
let trap_info = (ctx.srcloc(insn), TrapCode::BadConversionToInteger);
ctx.emit(Inst::Udf { trap_info });
let tmp = ctx.tmp(RegClass::V128, I128);
let tmp = ctx.alloc_tmp(RegClass::V128, I128);
// Check that the input is in range, with "truncate towards zero" semantics. This means
// we allow values that are slightly out of range:
@ -1572,7 +1642,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
});
@ -1585,7 +1655,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
});
@ -1615,7 +1685,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
});
@ -1628,7 +1698,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
});
@ -1704,8 +1774,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
_ => unreachable!(),
};
let rtmp1 = ctx.tmp(RegClass::V128, in_ty);
let rtmp2 = ctx.tmp(RegClass::V128, in_ty);
let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty);
let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
if in_bits == 32 {
ctx.emit(Inst::LoadFpuConst32 {
@ -1790,6 +1860,35 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
});
}
Opcode::IaddIfcout => {
// This is a two-output instruction that is needed for the
// legalizer's explicit heap-check sequence, among possible other
// uses. Its second output is a flags output only ever meant to
// check for overflow using the
// `backend.unsigned_add_overflow_condition()` condition.
//
// Note that the CLIF validation will ensure that no flag-setting
// operation comes between this IaddIfcout and its use (e.g., a
// Trapif). Thus, we can rely on implicit communication through the
// processor flags rather than explicitly generating flags into a
// register. We simply use the variant of the add instruction that
// sets flags (`adds`) here.
// Ensure that the second output isn't directly called for: it
// should only be used by a flags-consuming op, which will directly
// understand this instruction and merge the comparison.
assert!(!ctx.is_reg_needed(insn, ctx.get_output(insn, 1).to_reg()));
// Now handle the iadd as above, except use an AddS opcode that sets
// flags.
let rd = output_to_reg(ctx, outputs[0]);
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
let ty = ty.unwrap();
let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64);
ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
}
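
A scalar sketch of what the adds-plus-carry-check pattern computes, using plain Rust instead of processor flags (illustration only; `checked_heap_add` is a made-up name):

// Illustrative sketch only; not Cranelift code.
// `iadd_ifcout` lowered to `adds` produces the wrapped sum plus a carry
// flag; a following trap-on-carry then acts as an overflow check.
fn checked_heap_add(base: u64, offset: u64) -> Result<u64, &'static str> {
    let (sum, carry) = base.overflowing_add(offset);
    if carry {
        Err("trap: unsigned add overflowed")
    } else {
        Ok(sum)
    }
}

fn main() {
    assert_eq!(checked_heap_add(40, 2), Ok(42));
    assert!(checked_heap_add(u64::MAX, 1).is_err());
}
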
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::UdivImm
@ -1800,7 +1899,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
| Opcode::IaddCin
| Opcode::IaddIfcin
| Opcode::IaddCout
| Opcode::IaddIfcout
| Opcode::IaddCarry
| Opcode::IaddIfcarry
| Opcode::IsubBin
@ -1849,6 +1947,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
| Opcode::X86Pmaxu
| Opcode::X86Pmins
| Opcode::X86Pminu
| Opcode::X86Pmullq
| Opcode::X86Pmuludq
| Opcode::X86Packss
| Opcode::X86Punpckh
| Opcode::X86Punpckl
@ -1860,14 +1960,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::AvgRound => unimplemented!(),
Opcode::TlsValue => unimplemented!(),
}
Ok(())
}
pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
@ -1881,18 +1983,14 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
let op0 = ctx.data(branches[0]).opcode();
let op1 = ctx.data(branches[1]).opcode();
//println!(
// "lowering two-branch group: opcodes are {:?} and {:?}",
// op0, op1
//);
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
let taken = BranchTarget::Block(targets[0]);
let taken = BranchTarget::Label(targets[0]);
let not_taken = match op1 {
Opcode::Jump => BranchTarget::Block(targets[1]),
Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()),
Opcode::Jump => BranchTarget::Label(targets[1]),
Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
_ => unreachable!(), // assert above.
};
match op0 {
Opcode::Brz | Opcode::Brnz => {
let flag_input = InsnInput {
@ -1952,6 +2050,8 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
Opcode::BrIcmp => {
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let is_signed = condcode_is_signed(condcode);
let ty = ctx.input_ty(branches[0], 0);
let bits = ty_bits(ty);
@ -1984,13 +2084,15 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
Opcode::Brif => {
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let is_signed = condcode_is_signed(condcode);
let flag_input = InsnInput {
insn: branches[0],
@ -2001,7 +2103,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
} else {
// If the ifcmp result is actually placed in a
@ -2011,7 +2113,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
}
@ -2019,6 +2121,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
Opcode::Brff => {
let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_fp_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let flag_input = InsnInput {
insn: branches[0],
input: 0,
@ -2028,7 +2131,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
} else {
// If the ffcmp result is actually placed in a
@ -2038,7 +2141,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
}
@ -2055,12 +2158,15 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
// fills in `targets[0]` with our fallthrough block, so this
// is valid for both Jump and Fallthrough.
ctx.emit(Inst::Jump {
dest: BranchTarget::Block(targets[0]),
dest: BranchTarget::Label(targets[0]),
});
}
Opcode::BrTable => {
// Expand `br_table index, default, JT` to:
//
// emit_island // this forces an island at this point
// // if the jumptable would push us past
// // the deadline
// subs idx, #jt_size
// b.hs default
// adr vTmp1, PC+16
@ -2070,6 +2176,11 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
// [jumptable offsets relative to JT base]
let jt_size = targets.len() - 1;
assert!(jt_size <= std::u32::MAX as usize);
ctx.emit(Inst::EmitIsland {
needed_space: 4 * (6 + jt_size) as CodeOffset,
});
let ridx = input_to_reg(
ctx,
InsnInput {
@ -2079,8 +2190,8 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
NarrowValueMode::ZeroExtend32,
);
let rtmp1 = ctx.tmp(RegClass::I64, I32);
let rtmp2 = ctx.tmp(RegClass::I64, I32);
let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32);
let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32);
// Bounds-check and branch to default.
if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
@ -2099,10 +2210,10 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
rm: rtmp1.to_reg(),
});
}
let default_target = BranchTarget::Block(targets[0]);
ctx.emit(Inst::CondBrLowered {
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
let default_target = BranchTarget::Label(targets[0]);
ctx.emit(Inst::OneWayCondBr {
target: default_target.clone(),
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
});
// Emit the compound instruction that does:
@ -2123,19 +2234,23 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
let jt_targets: Vec<BranchTarget> = targets
.iter()
.skip(1)
.map(|bix| BranchTarget::Block(*bix))
.map(|bix| BranchTarget::Label(*bix))
.collect();
let targets_for_term: Vec<BlockIndex> = targets.to_vec();
let targets_for_term: Vec<MachLabel> = targets.to_vec();
ctx.emit(Inst::JTSequence {
ridx,
rtmp1,
rtmp2,
targets: jt_targets,
targets_for_term,
info: Box::new(JTSequenceInfo {
targets: jt_targets,
targets_for_term: targets_for_term,
}),
});
}
_ => panic!("Unknown branch type!"),
}
}
Ok(())
}
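
The br_table expansion above boils down to a bounds check against the table size, a branch to the default target when out of range, and an indexed branch otherwise. A plain-Rust sketch of that control flow (illustrative; this `br_table` helper is made up, not the lowering itself):

// Illustrative sketch only; not Cranelift code.
// Dispatch `index` through a table of handlers, routing out-of-range
// indices to `default`, mirroring the `subs` / `b.hs default` pair.
fn br_table(index: usize, table: &[fn() -> &'static str], default: fn() -> &'static str) -> &'static str {
    if index >= table.len() {
        return default(); // "b.hs default" (unsigned >=)
    }
    table[index]() // indexed branch through the jump table
}

fn main() {
    fn t0() -> &'static str { "t0" }
    fn t1() -> &'static str { "t1" }
    fn fallback() -> &'static str { "default" }
    let table: &[fn() -> &'static str] = &[t0, t1];
    assert_eq!(br_table(1, table, fallback), "t1");
    assert_eq!(br_table(7, table, fallback), "default");
}
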


@ -1,5 +1,6 @@
//! ARM 64-bit Instruction Set Architecture.
use crate::ir::condcodes::IntCC;
use crate::ir::Function;
use crate::isa::Builder as IsaBuilder;
use crate::machinst::{
@ -15,7 +16,7 @@ use target_lexicon::{Aarch64Architecture, Architecture, Triple};
// New backend:
mod abi;
mod inst;
pub(crate) mod inst;
mod lower;
mod lower_inst;
@ -25,12 +26,18 @@ use inst::create_reg_universe;
pub struct AArch64Backend {
triple: Triple,
flags: settings::Flags,
reg_universe: RealRegUniverse,
}
impl AArch64Backend {
/// Create a new AArch64 backend with the given (shared) flags.
pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend {
AArch64Backend { triple, flags }
let reg_universe = create_reg_universe(&flags);
AArch64Backend {
triple,
flags,
reg_universe,
}
}
/// This performs lowering to VCode, register-allocates the code, computes block layout and
@ -40,7 +47,7 @@ impl AArch64Backend {
func: &Function,
flags: settings::Flags,
) -> CodegenResult<VCode<inst::Inst>> {
let abi = Box::new(abi::AArch64ABIBody::new(func, flags));
let abi = Box::new(abi::AArch64ABIBody::new(func, flags)?);
compile::compile::<AArch64Backend>(func, self, abi)
}
}
@ -53,7 +60,7 @@ impl MachBackend for AArch64Backend {
) -> CodegenResult<MachCompileResult> {
let flags = self.flags();
let vcode = self.compile_vcode(func, flags.clone())?;
let sections = vcode.emit();
let buffer = vcode.emit();
let frame_size = vcode.frame_size();
let disasm = if want_disasm {
@ -62,8 +69,10 @@ impl MachBackend for AArch64Backend {
None
};
let buffer = buffer.finish();
Ok(MachCompileResult {
sections,
buffer,
frame_size,
disasm,
})
@ -81,8 +90,21 @@ impl MachBackend for AArch64Backend {
&self.flags
}
fn reg_universe(&self) -> RealRegUniverse {
create_reg_universe(&self.flags)
fn reg_universe(&self) -> &RealRegUniverse {
&self.reg_universe
}
fn unsigned_add_overflow_condition(&self) -> IntCC {
// Unsigned `>=`; this corresponds to the carry flag set on aarch64, which happens on
// overflow of an add.
IntCC::UnsignedGreaterThanOrEqual
}
fn unsigned_sub_overflow_condition(&self) -> IntCC {
// unsigned `<`; this corresponds to the carry flag cleared on aarch64, which happens on
// underflow of a subtract (aarch64 follows a carry-cleared-on-borrow convention, the
// opposite of x86).
IntCC::UnsignedLessThan
}
}
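
A quick plain-Rust check of the conventions these two hooks describe (illustration only): unsigned add overflow is the carry-set case on aarch64, tested as HS (UnsignedGreaterThanOrEqual), while unsigned subtract borrow is the carry-cleared case, tested as LO (UnsignedLessThan).

// Illustrative sketch only; not Cranelift code.
fn main() {
    // Unsigned add: wrap-around is the "carry set" case on aarch64.
    let (_, add_overflowed) = u32::MAX.overflowing_add(1);
    assert!(add_overflowed);

    // Unsigned subtract: borrow is the "carry cleared" case
    // (carry-cleared-on-borrow convention).
    let (_, sub_borrowed) = 1u32.overflowing_sub(2);
    assert!(sub_borrowed);

    // No wrap, no borrow: the opposite conditions hold.
    assert!(!2u32.overflowing_add(3).1 && !5u32.overflowing_sub(3).1);
}
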
@ -134,8 +156,8 @@ mod test {
Triple::from_str("aarch64").unwrap(),
settings::Flags::new(shared_flags),
);
let sections = backend.compile_function(&mut func, false).unwrap().sections;
let code = &sections.sections[0].data;
let buffer = backend.compile_function(&mut func, false).unwrap().buffer;
let code = &buffer.data[..];
// stp x29, x30, [sp, #-16]!
// mov x29, sp
@ -149,7 +171,7 @@ mod test {
0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6,
];
assert_eq!(code, &golden);
assert_eq!(code, &golden[..]);
}
#[test]
@ -192,34 +214,32 @@ mod test {
let result = backend
.compile_function(&mut func, /* want_disasm = */ false)
.unwrap();
let code = &result.sections.sections[0].data;
let code = &result.buffer.data[..];
// stp x29, x30, [sp, #-16]!
// mov x29, sp
// mov x1, x0
// mov x0, #0x1234
// add w1, w1, w0
// mov w2, w1
// cbz x2, ...
// mov w2, w1
// cbz x2, ...
// sub w0, w1, w0
// mov x1, #0x1234 // #4660
// add w0, w0, w1
// mov w1, w0
// cbnz x1, 0x28
// mov x1, #0x1234 // #4660
// add w1, w0, w1
// mov w1, w1
// cbnz x1, 0x18
// mov w1, w0
// cbnz x1, 0x18
// mov x1, #0x1234 // #4660
// sub w0, w0, w1
// mov sp, x29
// ldp x29, x30, [sp], #16
// ret
// add w2, w1, w0
// mov w2, w2
// cbnz x2, ... <---- compound branch (cond / uncond)
// b ... <----
let golden = vec![
0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46,
0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4,
0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03,
0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b,
0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17,
253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161,
0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3,
0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123,
193, 168, 192, 3, 95, 214,
];
assert_eq!(code, &golden);
assert_eq!(code, &golden[..]);
}
}


@ -17,6 +17,7 @@ use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
use crate::regalloc;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::any::Any;
use core::fmt;
use target_lexicon::{Architecture, Triple};
@ -135,6 +136,10 @@ impl TargetIsa for Isa {
fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC {
ir::condcodes::IntCC::UnsignedGreaterThanOrEqual
}
fn as_any(&self) -> &dyn Any {
self as &dyn Any
}
}
impl fmt::Display for Isa {


@ -66,6 +66,7 @@ use crate::settings::SetResult;
use crate::timing;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::any::Any;
use core::fmt;
use core::fmt::{Debug, Formatter};
use target_lexicon::{triple, Architecture, PointerWidth, Triple};
@ -77,11 +78,14 @@ mod riscv;
#[cfg(feature = "x86")]
mod x86;
#[cfg(feature = "x64")]
mod x64;
#[cfg(feature = "arm32")]
mod arm32;
#[cfg(feature = "arm64")]
mod aarch64;
pub(crate) mod aarch64;
#[cfg(feature = "unwind")]
pub mod unwind;
@ -419,6 +423,10 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
fn get_mach_backend(&self) -> Option<&dyn MachBackend> {
None
}
/// Return an [Any] reference for downcasting to the ISA-specific implementation of this trait
/// with `isa.as_any().downcast_ref::<isa::foo::Isa>()`.
fn as_any(&self) -> &dyn Any;
}
impl Debug for &dyn TargetIsa {


@ -17,6 +17,7 @@ use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
use crate::regalloc;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::any::Any;
use core::fmt;
use target_lexicon::{PointerWidth, Triple};
@ -130,6 +131,10 @@ impl TargetIsa for Isa {
fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC {
unimplemented!()
}
fn as_any(&self) -> &dyn Any {
self as &dyn Any
}
}
#[cfg(test)]
@ -163,7 +168,7 @@ mod tests {
let arg32 = func.dfg.append_block_param(block, types::I32);
// Try to encode iadd_imm.i64 v1, -10.
let inst64 = InstructionData::BinaryImm {
let inst64 = InstructionData::BinaryImm64 {
opcode: Opcode::IaddImm,
arg: arg64,
imm: immediates::Imm64::new(-10),
@ -176,7 +181,7 @@ mod tests {
);
// Try to encode iadd_imm.i64 v1, -10000.
let inst64_large = InstructionData::BinaryImm {
let inst64_large = InstructionData::BinaryImm64 {
opcode: Opcode::IaddImm,
arg: arg64,
imm: immediates::Imm64::new(-10000),
@ -186,7 +191,7 @@ mod tests {
assert!(isa.encode(&func, &inst64_large, types::I64).is_err());
// Create an iadd_imm.i32 which is encodable in RV64.
let inst32 = InstructionData::BinaryImm {
let inst32 = InstructionData::BinaryImm64 {
opcode: Opcode::IaddImm,
arg: arg32,
imm: immediates::Imm64::new(10),
@ -214,7 +219,7 @@ mod tests {
let arg32 = func.dfg.append_block_param(block, types::I32);
// Try to encode iadd_imm.i64 v1, -10.
let inst64 = InstructionData::BinaryImm {
let inst64 = InstructionData::BinaryImm64 {
opcode: Opcode::IaddImm,
arg: arg64,
imm: immediates::Imm64::new(-10),
@ -224,7 +229,7 @@ mod tests {
assert!(isa.encode(&func, &inst64, types::I64).is_err());
// Try to encode iadd_imm.i64 v1, -10000.
let inst64_large = InstructionData::BinaryImm {
let inst64_large = InstructionData::BinaryImm64 {
opcode: Opcode::IaddImm,
arg: arg64,
imm: immediates::Imm64::new(-10000),
@ -234,7 +239,7 @@ mod tests {
assert!(isa.encode(&func, &inst64_large, types::I64).is_err());
// Create an iadd_imm.i32 which is encodable in RV32.
let inst32 = InstructionData::BinaryImm {
let inst32 = InstructionData::BinaryImm64 {
opcode: Opcode::IaddImm,
arg: arg32,
imm: immediates::Imm64::new(10),


@ -8,7 +8,6 @@ use thiserror::Error;
use serde::{Deserialize, Serialize};
type Register = u16;
type Expression = Vec<u8>;
/// Enumerate the errors possible in mapping Cranelift registers to their DWARF equivalent.
#[allow(missing_docs)]
@ -23,6 +22,8 @@ pub enum RegisterMappingError {
}
// This mirrors gimli's CallFrameInstruction, but is serializable
// This excludes CfaExpression, Expression, ValExpression due to
// https://github.com/gimli-rs/gimli/issues/513.
// TODO: if gimli ever adds serialization support, remove this type
#[derive(Clone, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
@ -30,15 +31,12 @@ pub(crate) enum CallFrameInstruction {
Cfa(Register, i32),
CfaRegister(Register),
CfaOffset(i32),
CfaExpression(Expression),
Restore(Register),
Undefined(Register),
SameValue(Register),
Offset(Register, i32),
ValOffset(Register, i32),
Register(Register, Register),
Expression(Register, Expression),
ValExpression(Register, Expression),
RememberState,
RestoreState,
ArgsSize(u32),
@ -52,34 +50,33 @@ impl From<gimli::write::CallFrameInstruction> for CallFrameInstruction {
CallFrameInstruction::Cfa(reg, offset) => Self::Cfa(reg.0, offset),
CallFrameInstruction::CfaRegister(reg) => Self::CfaRegister(reg.0),
CallFrameInstruction::CfaOffset(offset) => Self::CfaOffset(offset),
CallFrameInstruction::CfaExpression(expr) => Self::CfaExpression(expr.0),
CallFrameInstruction::Restore(reg) => Self::Restore(reg.0),
CallFrameInstruction::Undefined(reg) => Self::Undefined(reg.0),
CallFrameInstruction::SameValue(reg) => Self::SameValue(reg.0),
CallFrameInstruction::Offset(reg, offset) => Self::Offset(reg.0, offset),
CallFrameInstruction::ValOffset(reg, offset) => Self::ValOffset(reg.0, offset),
CallFrameInstruction::Register(reg1, reg2) => Self::Register(reg1.0, reg2.0),
CallFrameInstruction::Expression(reg, expr) => Self::Expression(reg.0, expr.0),
CallFrameInstruction::ValExpression(reg, expr) => Self::ValExpression(reg.0, expr.0),
CallFrameInstruction::RememberState => Self::RememberState,
CallFrameInstruction::RestoreState => Self::RestoreState,
CallFrameInstruction::ArgsSize(size) => Self::ArgsSize(size),
_ => {
// Cranelift's unwind support does not generate `CallFrameInstruction`s with
// Expression at this moment, and it is not trivial to
// serialize such instructions.
panic!("CallFrameInstruction with Expression not supported");
}
}
}
}
impl Into<gimli::write::CallFrameInstruction> for CallFrameInstruction {
fn into(self) -> gimli::write::CallFrameInstruction {
use gimli::{
write::{CallFrameInstruction, Expression},
Register,
};
use gimli::{write::CallFrameInstruction, Register};
match self {
Self::Cfa(reg, offset) => CallFrameInstruction::Cfa(Register(reg), offset),
Self::CfaRegister(reg) => CallFrameInstruction::CfaRegister(Register(reg)),
Self::CfaOffset(offset) => CallFrameInstruction::CfaOffset(offset),
Self::CfaExpression(expr) => CallFrameInstruction::CfaExpression(Expression(expr)),
Self::Restore(reg) => CallFrameInstruction::Restore(Register(reg)),
Self::Undefined(reg) => CallFrameInstruction::Undefined(Register(reg)),
Self::SameValue(reg) => CallFrameInstruction::SameValue(Register(reg)),
@ -88,12 +85,6 @@ impl Into<gimli::write::CallFrameInstruction> for CallFrameInstruction {
Self::Register(reg1, reg2) => {
CallFrameInstruction::Register(Register(reg1), Register(reg2))
}
Self::Expression(reg, expr) => {
CallFrameInstruction::Expression(Register(reg), Expression(expr))
}
Self::ValExpression(reg, expr) => {
CallFrameInstruction::ValExpression(Register(reg), Expression(expr))
}
Self::RememberState => CallFrameInstruction::RememberState,
Self::RestoreState => CallFrameInstruction::RestoreState,
Self::ArgsSize(size) => CallFrameInstruction::ArgsSize(size),

third_party/rust/cranelift-codegen/src/isa/x64/abi.rs (new vendored file, 467 lines)

@ -0,0 +1,467 @@
//! Implementation of the standard x64 ABI.
use alloc::vec::Vec;
use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable};
use crate::ir::{self, types, types::*, ArgumentExtension, StackSlot, Type};
use crate::isa::{self, x64::inst::*};
use crate::machinst::*;
use crate::settings;
use args::*;
#[derive(Clone, Debug)]
enum ABIArg {
Reg(RealReg),
_Stack,
}
#[derive(Clone, Debug)]
enum ABIRet {
Reg(RealReg),
_Stack,
}
pub(crate) struct X64ABIBody {
args: Vec<ABIArg>,
rets: Vec<ABIRet>,
/// Offsets to each stack slot.
_stack_slots: Vec<usize>,
/// Total stack size of all the stack slots.
stack_slots_size: usize,
/// Clobbered registers, as indicated by regalloc.
clobbered: Set<Writable<RealReg>>,
/// Total number of spill slots, as indicated by regalloc.
num_spill_slots: Option<usize>,
/// Calculated while creating the prologue, and used when creating the epilogue. Amount by
/// which RSP is adjusted downwards to allocate the spill area.
frame_size_bytes: Option<usize>,
call_conv: isa::CallConv,
/// The settings controlling this function's compilation.
flags: settings::Flags,
}
fn in_int_reg(ty: types::Type) -> bool {
match ty {
types::I8
| types::I16
| types::I32
| types::I64
| types::B1
| types::B8
| types::B16
| types::B32
| types::B64 => true,
_ => false,
}
}
fn get_intreg_for_arg_systemv(idx: usize) -> Option<Reg> {
match idx {
0 => Some(regs::rdi()),
1 => Some(regs::rsi()),
2 => Some(regs::rdx()),
3 => Some(regs::rcx()),
4 => Some(regs::r8()),
5 => Some(regs::r9()),
_ => None,
}
}
fn get_intreg_for_retval_systemv(idx: usize) -> Option<Reg> {
match idx {
0 => Some(regs::rax()),
1 => Some(regs::rdx()),
_ => None,
}
}
fn is_callee_save_systemv(r: RealReg) -> bool {
use regs::*;
match r.get_class() {
RegClass::I64 => match r.get_hw_encoding() as u8 {
ENC_RBX | ENC_RBP | ENC_R12 | ENC_R13 | ENC_R14 | ENC_R15 => true,
_ => false,
},
_ => unimplemented!(),
}
}
fn get_callee_saves(regs: Vec<Writable<RealReg>>) -> Vec<Writable<RealReg>> {
regs.into_iter()
.filter(|r| is_callee_save_systemv(r.to_reg()))
.collect()
}
impl X64ABIBody {
/// Create a new body ABI instance.
pub(crate) fn new(f: &ir::Function, flags: settings::Flags) -> Self {
// Compute args and retvals from signature.
let mut args = vec![];
let mut next_int_arg = 0;
for param in &f.signature.params {
match param.purpose {
ir::ArgumentPurpose::VMContext if f.signature.call_conv.extends_baldrdash() => {
// `VMContext` is `r14` in Baldrdash.
args.push(ABIArg::Reg(regs::r14().to_real_reg()));
}
ir::ArgumentPurpose::Normal | ir::ArgumentPurpose::VMContext => {
if in_int_reg(param.value_type) {
if let Some(reg) = get_intreg_for_arg_systemv(next_int_arg) {
args.push(ABIArg::Reg(reg.to_real_reg()));
} else {
unimplemented!("passing arg on the stack");
}
next_int_arg += 1;
} else {
unimplemented!("non int normal register")
}
}
_ => unimplemented!("other parameter purposes"),
}
}
let mut rets = vec![];
let mut next_int_retval = 0;
for ret in &f.signature.returns {
match ret.purpose {
ir::ArgumentPurpose::Normal => {
if in_int_reg(ret.value_type) {
if let Some(reg) = get_intreg_for_retval_systemv(next_int_retval) {
rets.push(ABIRet::Reg(reg.to_real_reg()));
} else {
unimplemented!("passing return on the stack");
}
next_int_retval += 1;
} else {
unimplemented!("returning non integer normal value");
}
}
_ => {
unimplemented!("non normal argument purpose");
}
}
}
// Compute stackslot locations and total stackslot size.
let mut stack_offset: usize = 0;
let mut _stack_slots = vec![];
for (stackslot, data) in f.stack_slots.iter() {
let off = stack_offset;
stack_offset += data.size as usize;
// 8-byte align.
stack_offset = (stack_offset + 7) & !7usize;

debug_assert_eq!(stackslot.as_u32() as usize, _stack_slots.len());
_stack_slots.push(off);
}
Self {
args,
rets,
_stack_slots,
stack_slots_size: stack_offset,
clobbered: Set::empty(),
num_spill_slots: None,
frame_size_bytes: None,
call_conv: f.signature.call_conv.clone(),
flags,
}
}
}
impl ABIBody for X64ABIBody {
type I = Inst;
fn temp_needed(&self) -> bool {
false
}
fn init(&mut self, _: Option<Writable<Reg>>) {}
fn flags(&self) -> &settings::Flags {
&self.flags
}
fn num_args(&self) -> usize {
unimplemented!()
}
fn num_retvals(&self) -> usize {
unimplemented!()
}
fn num_stackslots(&self) -> usize {
unimplemented!()
}
fn liveins(&self) -> Set<RealReg> {
let mut set: Set<RealReg> = Set::empty();
for arg in &self.args {
if let &ABIArg::Reg(r) = arg {
set.insert(r);
}
}
set
}
fn liveouts(&self) -> Set<RealReg> {
let mut set: Set<RealReg> = Set::empty();
for ret in &self.rets {
if let &ABIRet::Reg(r) = ret {
set.insert(r);
}
}
set
}
fn gen_copy_arg_to_reg(&self, idx: usize, to_reg: Writable<Reg>) -> Inst {
match &self.args[idx] {
ABIArg::Reg(from_reg) => {
if from_reg.get_class() == RegClass::I32 || from_reg.get_class() == RegClass::I64 {
// TODO do we need a sign extension if it's I32?
return Inst::mov_r_r(/*is64=*/ true, from_reg.to_reg(), to_reg);
}
unimplemented!("moving from non-int arg to vreg");
}
ABIArg::_Stack => unimplemented!("moving from stack arg to vreg"),
}
}
fn gen_retval_area_setup(&self) -> Option<Inst> {
None
}
fn gen_copy_reg_to_retval(
&self,
idx: usize,
from_reg: Writable<Reg>,
ext: ArgumentExtension,
) -> Vec<Inst> {
match ext {
ArgumentExtension::None => {}
_ => unimplemented!(
"unimplemented argument extension {:?} is required for baldrdash",
ext
),
};
let mut ret = Vec::new();
match &self.rets[idx] {
ABIRet::Reg(to_reg) => {
if to_reg.get_class() == RegClass::I32 || to_reg.get_class() == RegClass::I64 {
ret.push(Inst::mov_r_r(
/*is64=*/ true,
from_reg.to_reg(),
Writable::<Reg>::from_reg(to_reg.to_reg()),
))
} else {
unimplemented!("moving from vreg to non-int return value");
}
}
ABIRet::_Stack => {
unimplemented!("moving from vreg to stack return value");
}
}
ret
}
fn gen_ret(&self) -> Inst {
Inst::ret()
}
fn gen_epilogue_placeholder(&self) -> Inst {
Inst::epilogue_placeholder()
}
fn set_num_spillslots(&mut self, slots: usize) {
self.num_spill_slots = Some(slots);
}
fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>) {
self.clobbered = clobbered;
}
fn stackslot_addr(&self, _slot: StackSlot, _offset: u32, _into_reg: Writable<Reg>) -> Inst {
unimplemented!()
}
fn load_stackslot(
&self,
_slot: StackSlot,
_offset: u32,
_ty: Type,
_into_reg: Writable<Reg>,
) -> Inst {
unimplemented!("load_stackslot")
}
fn store_stackslot(&self, _slot: StackSlot, _offset: u32, _ty: Type, _from_reg: Reg) -> Inst {
unimplemented!("store_stackslot")
}
fn load_spillslot(&self, _slot: SpillSlot, _ty: Type, _into_reg: Writable<Reg>) -> Inst {
unimplemented!("load_spillslot")
}
fn store_spillslot(&self, _slot: SpillSlot, _ty: Type, _from_reg: Reg) -> Inst {
unimplemented!("store_spillslot")
}
fn gen_prologue(&mut self) -> Vec<Inst> {
let r_rsp = regs::rsp();
let mut insts = vec![];
// Baldrdash generates its own prologue sequence, so we don't have to.
if !self.call_conv.extends_baldrdash() {
let r_rbp = regs::rbp();
let w_rbp = Writable::<Reg>::from_reg(r_rbp);
// The "traditional" pre-preamble
// RSP before the call will be 0 % 16. So here, it is 8 % 16.
insts.push(Inst::push64(RMI::reg(r_rbp)));
// RSP is now 0 % 16
insts.push(Inst::mov_r_r(true, r_rsp, w_rbp));
}
// Save callee saved registers that we trash. Keep track of how much space we've used, so
// as to know what we have to do to get the base of the spill area 0 % 16.
let mut callee_saved_used = 0;
let clobbered = get_callee_saves(self.clobbered.to_vec());
for reg in clobbered {
let r_reg = reg.to_reg();
match r_reg.get_class() {
RegClass::I64 => {
insts.push(Inst::push64(RMI::reg(r_reg.to_reg())));
callee_saved_used += 8;
}
_ => unimplemented!(),
}
}
let mut total_stacksize = self.stack_slots_size + 8 * self.num_spill_slots.unwrap();
if self.call_conv.extends_baldrdash() {
// Baldrdash expects the stack to take at least the number of words set in
// baldrdash_prologue_words; count them here.
debug_assert!(
!self.flags.enable_probestack(),
"baldrdash does not expect cranelift to emit stack probes"
);
total_stacksize += self.flags.baldrdash_prologue_words() as usize * 8;
}
debug_assert!(callee_saved_used % 16 == 0 || callee_saved_used % 16 == 8);
let frame_size = total_stacksize + callee_saved_used % 16;
// Now make sure the frame stack is aligned, so RSP == 0 % 16 in the function's body.
let frame_size = (frame_size + 15) & !15;
if frame_size > 0x7FFF_FFFF {
unimplemented!("gen_prologue(x86): total_stacksize >= 2G");
}
if !self.call_conv.extends_baldrdash() {
// Explicitly allocate the frame.
let w_rsp = Writable::<Reg>::from_reg(r_rsp);
if frame_size > 0 {
insts.push(Inst::alu_rmi_r(
true,
RMI_R_Op::Sub,
RMI::imm(frame_size as u32),
w_rsp,
));
}
}
// Stash this value. We'll need it for the epilogue.
debug_assert!(self.frame_size_bytes.is_none());
self.frame_size_bytes = Some(frame_size);
insts
}
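
The frame-size rounding in gen_prologue is the usual power-of-two align-up mask; a self-contained sketch (the `align_up` helper is illustrative, not part of this file):

// Illustrative sketch only; not Cranelift code.
// Round `size` up to the next multiple of `align`, as
// `(frame_size + 15) & !15` does above; works for any power of two.
fn align_up(size: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    (size + align - 1) & !(align - 1)
}

fn main() {
    assert_eq!(align_up(0, 16), 0);
    assert_eq!(align_up(1, 16), 16);
    assert_eq!(align_up(24, 16), 32);
    // The stack-slot loop uses the same trick with 8-byte slots.
    assert_eq!(align_up(13, 8), 16);
}
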
fn gen_epilogue(&self) -> Vec<Inst> {
let mut insts = vec![];
// Undo what we did in the prologue.
// Clear the spill area and the 16-alignment padding below it.
if !self.call_conv.extends_baldrdash() {
let frame_size = self.frame_size_bytes.unwrap();
if frame_size > 0 {
let r_rsp = regs::rsp();
let w_rsp = Writable::<Reg>::from_reg(r_rsp);
insts.push(Inst::alu_rmi_r(
true,
RMI_R_Op::Add,
RMI::imm(frame_size as u32),
w_rsp,
));
}
}
// Restore regs.
let clobbered = get_callee_saves(self.clobbered.to_vec());
for w_real_reg in clobbered.into_iter().rev() {
match w_real_reg.to_reg().get_class() {
RegClass::I64 => {
// TODO: make these conversion sequences less cumbersome.
insts.push(Inst::pop64(Writable::<Reg>::from_reg(
w_real_reg.to_reg().to_reg(),
)))
}
_ => unimplemented!(),
}
}
// Baldrdash generates its own preamble.
if !self.call_conv.extends_baldrdash() {
let r_rbp = regs::rbp();
let w_rbp = Writable::<Reg>::from_reg(r_rbp);
// Undo the "traditional" pre-preamble
// RSP before the call will be 0 % 16. So here, it is 8 % 16.
insts.push(Inst::pop64(w_rbp));
insts.push(Inst::ret());
}
insts
}
fn frame_size(&self) -> u32 {
self.frame_size_bytes
.expect("frame size not computed before prologue generation") as u32
}
fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 {
// We allocate in terms of 8-byte slots.
match (rc, ty) {
(RegClass::I64, _) => 1,
(RegClass::V128, F32) | (RegClass::V128, F64) => 1,
(RegClass::V128, _) => 2,
_ => panic!("Unexpected register class!"),
}
}
fn gen_spill(&self, _to_slot: SpillSlot, _from_reg: RealReg, _ty: Type) -> Inst {
unimplemented!()
}
fn gen_reload(&self, _to_reg: Writable<RealReg>, _from_slot: SpillSlot, _ty: Type) -> Inst {
unimplemented!()
}
}

third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs (new vendored file, 420 lines)

@ -0,0 +1,420 @@
//! Instruction operand sub-components (aka "parts"): definitions and printing.
use std::fmt;
use std::string::{String, ToString};
use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageCollector};
use crate::machinst::*;
use super::regs::show_ireg_sized;
/// A Memory Address. These denote a 64-bit value only.
#[derive(Clone)]
pub(crate) enum Addr {
/// Immediate sign-extended and a Register.
IR { simm32: u32, base: Reg },
/// sign-extend-32-to-64(Immediate) + Register1 + (Register2 << Shift)
IRRS {
simm32: u32,
base: Reg,
index: Reg,
shift: u8, /* 0 .. 3 only */
},
}
impl Addr {
// Constructors.
pub(crate) fn imm_reg(simm32: u32, base: Reg) -> Self {
debug_assert!(base.get_class() == RegClass::I64);
Self::IR { simm32, base }
}
pub(crate) fn imm_reg_reg_shift(simm32: u32, base: Reg, index: Reg, shift: u8) -> Self {
debug_assert!(base.get_class() == RegClass::I64);
debug_assert!(index.get_class() == RegClass::I64);
debug_assert!(shift <= 3);
Addr::IRRS {
simm32,
base,
index,
shift,
}
}
/// Add the regs mentioned by `self` to `collector`.
pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
match self {
Addr::IR { simm32: _, base } => {
collector.add_use(*base);
}
Addr::IRRS {
simm32: _,
base,
index,
shift: _,
} => {
collector.add_use(*base);
collector.add_use(*index);
}
}
}
}
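
A self-contained sketch of the effective address these two forms denote, i.e. sign-extended simm32 plus base plus index scaled by 1 << shift (plain arithmetic; `effective_addr` is a made-up helper, not emitter code):

// Illustrative sketch only; not Cranelift code.
fn effective_addr(simm32: u32, base: u64, index: u64, shift: u8) -> u64 {
    assert!(shift <= 3);
    let disp = simm32 as i32 as i64 as u64; // sign-extend 32 -> 64
    disp.wrapping_add(base).wrapping_add(index << shift)
}

fn main() {
    // -8(base, index, 4) with base = 0x1000 and index = 3 -> 0x1000 - 8 + 12.
    assert_eq!(effective_addr((-8i32) as u32, 0x1000, 3, 2), 0x1004);
}
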
impl ShowWithRRU for Addr {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
match self {
Addr::IR { simm32, base } => format!("{}({})", *simm32 as i32, base.show_rru(mb_rru)),
Addr::IRRS {
simm32,
base,
index,
shift,
} => format!(
"{}({},{},{})",
*simm32 as i32,
base.show_rru(mb_rru),
index.show_rru(mb_rru),
1 << shift
),
}
}
}
/// An operand which is either an integer Register, a value in Memory or an Immediate. This can
/// denote an 8, 16, 32 or 64 bit value. For the Immediate form, in the 8- and 16-bit case, only
/// the lower 8 or 16 bits of `simm32` are relevant. In the 64-bit case, the value denoted by
/// `simm32` is its sign-extension out to 64 bits.
#[derive(Clone)]
pub(crate) enum RMI {
R { reg: Reg },
M { addr: Addr },
I { simm32: u32 },
}
impl RMI {
// Constructors
pub(crate) fn reg(reg: Reg) -> RMI {
debug_assert!(reg.get_class() == RegClass::I64);
RMI::R { reg }
}
pub(crate) fn mem(addr: Addr) -> RMI {
RMI::M { addr }
}
pub(crate) fn imm(simm32: u32) -> RMI {
RMI::I { simm32 }
}
/// Add the regs mentioned by `self` to `collector`.
pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
match self {
RMI::R { reg } => collector.add_use(*reg),
RMI::M { addr } => addr.get_regs_as_uses(collector),
RMI::I { simm32: _ } => {}
}
}
}
impl ShowWithRRU for RMI {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
self.show_rru_sized(mb_rru, 8)
}
fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
match self {
RMI::R { reg } => show_ireg_sized(*reg, mb_rru, size),
RMI::M { addr } => addr.show_rru(mb_rru),
RMI::I { simm32 } => format!("${}", *simm32 as i32),
}
}
}
/// An operand which is either an integer Register or a value in Memory. This can denote an 8, 16,
/// 32 or 64 bit value.
#[derive(Clone)]
pub(crate) enum RM {
R { reg: Reg },
M { addr: Addr },
}
impl RM {
// Constructors.
pub(crate) fn reg(reg: Reg) -> Self {
debug_assert!(reg.get_class() == RegClass::I64);
RM::R { reg }
}
pub(crate) fn mem(addr: Addr) -> Self {
RM::M { addr }
}
/// Add the regs mentioned by `self` to `collector`.
pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
match self {
RM::R { reg } => collector.add_use(*reg),
RM::M { addr } => addr.get_regs_as_uses(collector),
}
}
}
impl ShowWithRRU for RM {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
self.show_rru_sized(mb_rru, 8)
}
fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
match self {
RM::R { reg } => show_ireg_sized(*reg, mb_rru, size),
RM::M { addr } => addr.show_rru(mb_rru),
}
}
}
/// Some basic ALU operations. TODO: maybe add Adc, Sbb.
#[derive(Clone, PartialEq)]
pub enum RMI_R_Op {
Add,
Sub,
And,
Or,
Xor,
/// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
Mul,
}
impl RMI_R_Op {
pub(crate) fn to_string(&self) -> String {
match self {
RMI_R_Op::Add => "add".to_string(),
RMI_R_Op::Sub => "sub".to_string(),
RMI_R_Op::And => "and".to_string(),
RMI_R_Op::Or => "or".to_string(),
RMI_R_Op::Xor => "xor".to_string(),
RMI_R_Op::Mul => "imul".to_string(),
}
}
}
impl fmt::Debug for RMI_R_Op {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// These indicate ways of extending (widening) a value, using the Intel naming:
/// B(yte) = u8, W(ord) = u16, L(ong)word = u32, Q(uad)word = u64
#[derive(Clone, PartialEq)]
pub enum ExtMode {
/// Byte -> Longword.
BL,
/// Byte -> Quadword.
BQ,
/// Word -> Longword.
WL,
/// Word -> Quadword.
WQ,
/// Longword -> Quadword.
LQ,
}
impl ExtMode {
pub(crate) fn to_string(&self) -> String {
match self {
ExtMode::BL => "bl".to_string(),
ExtMode::BQ => "bq".to_string(),
ExtMode::WL => "wl".to_string(),
ExtMode::WQ => "wq".to_string(),
ExtMode::LQ => "lq".to_string(),
}
}
pub(crate) fn dst_size(&self) -> u8 {
match self {
ExtMode::BL => 4,
ExtMode::BQ => 8,
ExtMode::WL => 4,
ExtMode::WQ => 8,
ExtMode::LQ => 8,
}
}
}
impl fmt::Debug for ExtMode {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// These indicate the form of a scalar shift: left, signed right, unsigned right.
#[derive(Clone)]
pub enum ShiftKind {
Left,
RightZ,
RightS,
}
impl ShiftKind {
pub(crate) fn to_string(&self) -> String {
match self {
ShiftKind::Left => "shl".to_string(),
ShiftKind::RightZ => "shr".to_string(),
ShiftKind::RightS => "sar".to_string(),
}
}
}
impl fmt::Debug for ShiftKind {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// These indicate condition code tests. Not all are represented since not all are useful in
/// compiler-generated code.
#[derive(Copy, Clone)]
#[repr(u8)]
pub enum CC {
/// overflow
O = 0,
/// no overflow
NO = 1,
/// < unsigned
B = 2,
/// >= unsigned
NB = 3,
/// zero
Z = 4,
/// not-zero
NZ = 5,
/// <= unsigned
BE = 6,
/// > unsigend
NBE = 7,
/// negative
S = 8,
/// not-negative
NS = 9,
/// < signed
L = 12,
/// >= signed
NL = 13,
/// <= signed
LE = 14,
/// > signed
NLE = 15,
}
impl CC {
pub(crate) fn to_string(&self) -> String {
match self {
CC::O => "o".to_string(),
CC::NO => "no".to_string(),
CC::B => "b".to_string(),
CC::NB => "nb".to_string(),
CC::Z => "z".to_string(),
CC::NZ => "nz".to_string(),
CC::BE => "be".to_string(),
CC::NBE => "nbe".to_string(),
CC::S => "s".to_string(),
CC::NS => "ns".to_string(),
CC::L => "l".to_string(),
CC::NL => "nl".to_string(),
CC::LE => "le".to_string(),
CC::NLE => "nle".to_string(),
}
}
pub(crate) fn invert(&self) -> CC {
match self {
CC::O => CC::NO,
CC::NO => CC::O,
CC::B => CC::NB,
CC::NB => CC::B,
CC::Z => CC::NZ,
CC::NZ => CC::Z,
CC::BE => CC::NBE,
CC::NBE => CC::BE,
CC::S => CC::NS,
CC::NS => CC::S,
CC::L => CC::NL,
CC::NL => CC::L,
CC::LE => CC::NLE,
CC::NLE => CC::LE,
}
}
pub(crate) fn get_enc(self) -> u8 {
self as u8
}
}
impl fmt::Debug for CC {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// A branch target. Either unresolved (basic-block index) or resolved (offset
/// from end of current instruction).
#[derive(Clone, Copy, Debug)]
pub enum BranchTarget {
/// An unresolved reference to a MachLabel.
Label(MachLabel),
/// A resolved reference to another instruction, in bytes.
ResolvedOffset(isize),
}
impl ShowWithRRU for BranchTarget {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
BranchTarget::Label(l) => format!("{:?}", l),
BranchTarget::ResolvedOffset(offs) => format!("(offset {})", offs),
}
}
}
impl BranchTarget {
/// Get the label.
pub fn as_label(&self) -> Option<MachLabel> {
match self {
&BranchTarget::Label(l) => Some(l),
_ => None,
}
}
/// Get the offset as a signed 32 bit byte offset. This returns the
/// offset in bytes between the first byte of the source and the first
/// byte of the target. It does not take into account the Intel-specific
/// rule that a branch offset is encoded as relative to the start of the
/// following instruction. That is a problem for the emitter to deal
/// with. If a label, returns zero.
pub fn as_offset32_or_zero(&self) -> i32 {
match self {
&BranchTarget::ResolvedOffset(off) => {
// Leave a bit of slack so that the emitter is guaranteed to
// be able to add the length of the jump instruction encoding
// to this value and still have a value in signed-32 range.
assert!(off >= -0x7FFF_FF00 && off <= 0x7FFF_FF00);
off as i32
}
_ => 0,
}
}
}

third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs (new vendored file, 892 lines)

@ -0,0 +1,892 @@
use regalloc::{Reg, RegClass};
use crate::isa::x64::inst::*;
fn low8willSXto64(x: u32) -> bool {
let xs = (x as i32) as i64;
xs == ((xs << 56) >> 56)
}
fn low8willSXto32(x: u32) -> bool {
let xs = x as i32;
xs == ((xs << 24) >> 24)
}
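
For reference, these predicates accept exactly the 32-bit values whose signed interpretation fits in a sign-extended 8-bit immediate; a standalone illustration with a renamed copy (`fits_simm8` is not part of this file):

// Illustrative sketch only; not Cranelift code.
fn fits_simm8(x: u32) -> bool {
    let xs = x as i32 as i64;
    xs == ((xs << 56) >> 56)
}

fn main() {
    assert!(fits_simm8(0x7F));          //  127: fits
    assert!(!fits_simm8(0x80));         //  128: does not fit
    assert!(fits_simm8(0xFFFF_FF80));   // -128: fits after sign extension
    assert!(!fits_simm8(0xFFFF_FF7F));  // -129: does not fit
}
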
//=============================================================================
// Instructions and subcomponents: emission
// For all of the routines that take both a memory-or-reg operand (sometimes
// called "E" in the Intel documentation) and a reg-only operand ("G" in
// Intelese), the order is always G first, then E.
//
// "enc" in the following means "hardware register encoding number".
#[inline(always)]
fn mkModRegRM(m0d: u8, encRegG: u8, rmE: u8) -> u8 {
debug_assert!(m0d < 4);
debug_assert!(encRegG < 8);
debug_assert!(rmE < 8);
((m0d & 3) << 6) | ((encRegG & 7) << 3) | (rmE & 7)
}
#[inline(always)]
fn mkSIB(shift: u8, encIndex: u8, encBase: u8) -> u8 {
debug_assert!(shift < 4);
debug_assert!(encIndex < 8);
debug_assert!(encBase < 8);
((shift & 3) << 6) | ((encIndex & 7) << 3) | (encBase & 7)
}
/// Get the encoding number from something which we sincerely hope is a real
/// register of class I64.
#[inline(always)]
fn iregEnc(reg: Reg) -> u8 {
debug_assert!(reg.is_real());
debug_assert!(reg.get_class() == RegClass::I64);
reg.get_hw_encoding()
}
// F_*: these flags describe special handling of the insn to be generated. Be
// careful with these. It is easy to create nonsensical combinations.
const F_NONE: u32 = 0;
/// Emit the REX prefix byte even if it appears to be redundant (== 0x40).
const F_RETAIN_REDUNDANT_REX: u32 = 1;
/// Set the W bit in the REX prefix to zero. By default it will be set to 1,
/// indicating a 64-bit operation.
const F_CLEAR_REX_W: u32 = 2;
/// Add an 0x66 (operand-size override) prefix. This is necessary to indicate
/// a 16-bit operation. Normally this will be used together with F_CLEAR_REX_W.
const F_PREFIX_66: u32 = 4;
/// This is the core 'emit' function for instructions that reference memory.
///
/// For an instruction that has as operands a register `encG` and a memory
/// address `memE`, create and emit, first the REX prefix, then caller-supplied
/// opcode byte(s) (`opcodes` and `numOpcodes`), then the MOD/RM byte, then
/// optionally, a SIB byte, and finally optionally an immediate that will be
/// derived from the `memE` operand. For most instructions up to and including
/// SSE4.2, that will be the whole instruction.
///
/// The opcodes are written bigendianly for the convenience of callers. For
/// example, if the opcode bytes to be emitted are, in this order, F3 0F 27,
/// then the caller should pass `opcodes` == 0xF3_0F_27 and `numOpcodes` == 3.
///
/// The register operand is represented here not as a `Reg` but as its hardware
/// encoding, `encG`. `flags` can specify special handling for the REX prefix.
/// By default, the REX prefix will indicate a 64-bit operation and will be
/// deleted if it is redundant (0x40). Note that for a 64-bit operation, the
/// REX prefix will normally never be redundant, since REX.W must be 1 to
/// indicate a 64-bit operation.
fn emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
mut numOpcodes: usize,
encG: u8,
memE: &Addr,
flags: u32,
) {
// General comment for this function: the registers in `memE` must be
// 64-bit integer registers, because they are part of an address
// expression. But `encG` can be derived from a register of any class.
let prefix66 = (flags & F_PREFIX_66) != 0;
let clearRexW = (flags & F_CLEAR_REX_W) != 0;
let retainRedundant = (flags & F_RETAIN_REDUNDANT_REX) != 0;
// The operand-size override, if requested. This indicates a 16-bit
// operation.
if prefix66 {
sink.put1(0x66);
}
match memE {
Addr::IR { simm32, base: regE } => {
// First, cook up the REX byte. This is easy.
let encE = iregEnc(*regE);
let w = if clearRexW { 0 } else { 1 };
let r = (encG >> 3) & 1;
let x = 0;
let b = (encE >> 3) & 1;
let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
if rex != 0x40 || retainRedundant {
sink.put1(rex);
}
// Now the opcode(s). These include any other prefixes the caller
// hands to us.
while numOpcodes > 0 {
numOpcodes -= 1;
sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
}
// Now the mod/rm and associated immediates. This is
// significantly complicated due to the multiple special cases.
if *simm32 == 0
&& encE != regs::ENC_RSP
&& encE != regs::ENC_RBP
&& encE != regs::ENC_R12
&& encE != regs::ENC_R13
{
// FIXME JRS 2020Feb11: those four tests can surely be
// replaced by a single mask-and-compare check. We should do
// that because this routine is likely to be hot.
sink.put1(mkModRegRM(0, encG & 7, encE & 7));
} else if *simm32 == 0 && (encE == regs::ENC_RSP || encE == regs::ENC_R12) {
sink.put1(mkModRegRM(0, encG & 7, 4));
sink.put1(0x24);
} else if low8willSXto32(*simm32) && encE != regs::ENC_RSP && encE != regs::ENC_R12 {
sink.put1(mkModRegRM(1, encG & 7, encE & 7));
sink.put1((simm32 & 0xFF) as u8);
} else if encE != regs::ENC_RSP && encE != regs::ENC_R12 {
sink.put1(mkModRegRM(2, encG & 7, encE & 7));
sink.put4(*simm32);
} else if (encE == regs::ENC_RSP || encE == regs::ENC_R12) && low8willSXto32(*simm32) {
// REX.B distinguishes RSP from R12
sink.put1(mkModRegRM(1, encG & 7, 4));
sink.put1(0x24);
sink.put1((simm32 & 0xFF) as u8);
} else if encE == regs::ENC_R12 || encE == regs::ENC_RSP {
//.. wait for test case for RSP case
// REX.B distinguishes RSP from R12
sink.put1(mkModRegRM(2, encG & 7, 4));
sink.put1(0x24);
sink.put4(*simm32);
} else {
unreachable!("emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE: IR");
}
}
// Bizarrely, the IRRS case is much simpler.
Addr::IRRS {
simm32,
base: regBase,
index: regIndex,
shift,
} => {
let encBase = iregEnc(*regBase);
let encIndex = iregEnc(*regIndex);
// The rex byte
let w = if clearRexW { 0 } else { 1 };
let r = (encG >> 3) & 1;
let x = (encIndex >> 3) & 1;
let b = (encBase >> 3) & 1;
let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
if rex != 0x40 || retainRedundant {
sink.put1(rex);
}
// All other prefixes and opcodes
while numOpcodes > 0 {
numOpcodes -= 1;
sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
}
// modrm, SIB, immediates
if low8willSXto32(*simm32) && encIndex != regs::ENC_RSP {
sink.put1(mkModRegRM(1, encG & 7, 4));
sink.put1(mkSIB(*shift, encIndex & 7, encBase & 7));
sink.put1(*simm32 as u8);
} else if encIndex != regs::ENC_RSP {
sink.put1(mkModRegRM(2, encG & 7, 4));
sink.put1(mkSIB(*shift, encIndex & 7, encBase & 7));
sink.put4(*simm32);
} else {
panic!("emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE: IRRS");
}
}
}
}
/// This is the core 'emit' function for instructions that do not reference
/// memory.
///
/// This is conceptually the same as
/// emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE, except it is for the case
/// where the E operand is a register rather than memory. Hence it is much
/// simpler.
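///
/// For example: `opcodes` == 0x89, `numOpcodes` == 1, `encG` == 1 (%rcx) and
/// `encE` == 2 (%rdx) with the default flags produce `movq %rcx, %rdx`, which
/// is the byte sequence 48 89 CA (REX.W, opcode, ModRM with mod=11, reg=001,
/// rm=010).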
fn emit_REX_OPCODES_MODRM_encG_encE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
mut numOpcodes: usize,
encG: u8,
encE: u8,
flags: u32,
) {
// EncG and EncE can be derived from registers of any class, and they
// don't even have to be from the same class. For example, for an
// integer-to-FP conversion insn, one might be RegClass::I64 and the other
// RegClass::V128.
let prefix66 = (flags & F_PREFIX_66) != 0;
let clearRexW = (flags & F_CLEAR_REX_W) != 0;
let retainRedundant = (flags & F_RETAIN_REDUNDANT_REX) != 0;
// The operand-size override
if prefix66 {
sink.put1(0x66);
}
// The rex byte
let w = if clearRexW { 0 } else { 1 };
let r = (encG >> 3) & 1;
let x = 0;
let b = (encE >> 3) & 1;
let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
if rex != 0x40 || retainRedundant {
sink.put1(rex);
}
// All other prefixes and opcodes
while numOpcodes > 0 {
numOpcodes -= 1;
sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
}
// Now the mod/rm byte. The instruction we're generating doesn't access
// memory, so there is no SIB byte or immediate -- we're done.
sink.put1(mkModRegRM(3, encG & 7, encE & 7));
}
// These are merely wrappers for the above two functions that facilitate passing
// actual `Reg`s rather than their encodings.
fn emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
numOpcodes: usize,
regG: Reg,
memE: &Addr,
flags: u32,
) {
// JRS FIXME 2020Feb07: this should really just be `regEnc` not `iregEnc`
let encG = iregEnc(regG);
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(sink, opcodes, numOpcodes, encG, memE, flags);
}
fn emit_REX_OPCODES_MODRM_regG_regE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
numOpcodes: usize,
regG: Reg,
regE: Reg,
flags: u32,
) {
// JRS FIXME 2020Feb07: these should really just be `regEnc` not `iregEnc`
let encG = iregEnc(regG);
let encE = iregEnc(regE);
emit_REX_OPCODES_MODRM_encG_encE(sink, opcodes, numOpcodes, encG, encE, flags);
}
/// Write a suitable number of bits from an imm64 to the sink.
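/// Note that a `size` of 8 still writes only four bytes: apart from `movabsq`
/// (which is emitted elsewhere), x86-64 instructions carry at most a 32-bit
/// immediate, which the processor sign-extends to 64 bits.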
fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) {
match size {
8 | 4 => sink.put4(simm32),
2 => sink.put2(simm32 as u16),
1 => sink.put1(simm32 as u8),
_ => panic!("x64::Inst::emit_simm: unreachable"),
}
}
/// The top-level emit function.
///
/// Important! Do not add improved (shortened) encoding cases to existing
/// instructions without also adding tests for those improved encodings. That
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
/// code.
///
/// For all instructions, make sure to have test coverage for all of the
/// following situations. Do this by creating the cross product resulting from
/// applying the following rules to each operand:
///
/// (1) for any insn that mentions a register: one test using a register from
/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
/// This helps detect incorrect REX prefix construction.
///
/// (2) for any insn that mentions a byte register: one test for each of the
/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
/// [r8b .. r11b] and [r12b .. r15b]. This checks that
/// apparently-redundant REX prefixes are retained when required.
///
/// (3) for any insn that contains an immediate field, check the following
/// cases: field is zero, field is in simm8 range (-128 .. 127), field is
/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
/// instructions that require a 32-bit immediate have a short-form encoding
/// when the imm is in simm8 range.
///
/// Rules (1), (2) and (3) don't apply for registers within address expressions
/// (`Addr`s). Those are already pretty well tested, and the registers in them
/// don't have any effect on the containing instruction (apart from possibly
/// requiring REX prefix bits).
///
/// When choosing registers for a test, avoid using registers with the same
/// offset within a given group. For example, don't use rax and r8, since they
/// both have the lowest 3 bits as 000, and so the test won't detect errors
/// where those 3-bit register sub-fields are confused by the emitter. Instead
/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
/// and bpl since they have the same offset in their group; use instead (eg) cl
/// and sil.
///
/// For all instructions, also add a test that uses only low-half registers
/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
/// prefixes are correctly omitted. This low-half restriction must apply to
/// _all_ registers in the insn, even those in address expressions.
///
/// Following these rules creates large numbers of test cases, but it's the
/// only way to make the emitter reliable.
///
/// Known possible improvements:
///
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
/// care?)
pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
match inst {
Inst::Nop { len: 0 } => {}
Inst::Alu_RMI_R {
is_64,
op,
src: srcE,
dst: regG,
} => {
let flags = if *is_64 { F_NONE } else { F_CLEAR_REX_W };
if *op == RMI_R_Op::Mul {
// Mul was shoehorned into RMI_R_Op even though it doesn't fit the usual pattern, so
// we have to special-case it.
match srcE {
RMI::R { reg: regE } => {
emit_REX_OPCODES_MODRM_regG_regE(
sink,
0x0FAF,
2,
regG.to_reg(),
*regE,
flags,
);
}
RMI::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FAF,
2,
regG.to_reg(),
addr,
flags,
);
}
RMI::I { simm32 } => {
let useImm8 = low8willSXto32(*simm32);
let opcode = if useImm8 { 0x6B } else { 0x69 };
// Yes, really, regG twice.
emit_REX_OPCODES_MODRM_regG_regE(
sink,
opcode,
1,
regG.to_reg(),
regG.to_reg(),
flags,
);
emit_simm(sink, if useImm8 { 1 } else { 4 }, *simm32);
}
}
} else {
let (opcode_R, opcode_M, subopcode_I) = match op {
RMI_R_Op::Add => (0x01, 0x03, 0),
RMI_R_Op::Sub => (0x29, 0x2B, 5),
RMI_R_Op::And => (0x21, 0x23, 4),
RMI_R_Op::Or => (0x09, 0x0B, 1),
RMI_R_Op::Xor => (0x31, 0x33, 6),
RMI_R_Op::Mul => panic!("unreachable"),
};
match srcE {
RMI::R { reg: regE } => {
// Note. The arguments .. regE .. regG .. sequence
// here is the opposite of what is expected. I'm not
// sure why this is. But I am fairly sure that the
// arg order could be switched back to the expected
// .. regG .. regE .. if opcode_rr is also switched
// over to the "other" basic integer opcode (viz, the
// R/RM vs RM/R duality). However, that would mean
// that the test results won't be in accordance with
// the GNU as reference output. In other words, the
// inversion exists as a result of using GNU as as a
// gold standard.
emit_REX_OPCODES_MODRM_regG_regE(
sink,
opcode_R,
1,
*regE,
regG.to_reg(),
flags,
);
// NB: if this is ever extended to handle byte size
// ops, be sure to retain redundant REX prefixes.
}
RMI::M { addr } => {
// Whereas here we revert to the "normal" G-E ordering.
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
opcode_M,
1,
regG.to_reg(),
addr,
flags,
);
}
RMI::I { simm32 } => {
let useImm8 = low8willSXto32(*simm32);
let opcode = if useImm8 { 0x83 } else { 0x81 };
// And also here we use the "normal" G-E ordering.
let encG = iregEnc(regG.to_reg());
emit_REX_OPCODES_MODRM_encG_encE(sink, opcode, 1, subopcode_I, encG, flags);
emit_simm(sink, if useImm8 { 1 } else { 4 }, *simm32);
}
}
}
}
Inst::Imm_R {
dst_is_64,
simm64,
dst,
} => {
let encDst = iregEnc(dst.to_reg());
if *dst_is_64 {
// FIXME JRS 2020Feb10: also use the 32-bit case here when
// possible
sink.put1(0x48 | ((encDst >> 3) & 1));
sink.put1(0xB8 | (encDst & 7));
sink.put8(*simm64);
} else {
if ((encDst >> 3) & 1) == 1 {
sink.put1(0x41);
}
sink.put1(0xB8 | (encDst & 7));
sink.put4(*simm64 as u32);
}
}
Inst::Mov_R_R { is_64, src, dst } => {
let flags = if *is_64 { F_NONE } else { F_CLEAR_REX_W };
emit_REX_OPCODES_MODRM_regG_regE(sink, 0x89, 1, *src, dst.to_reg(), flags);
}
Inst::MovZX_M_R { extMode, addr, dst } => {
match extMode {
ExtMode::BL => {
// MOVZBL is (REX.W==0) 0F B6 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB6,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::BQ => {
// MOVZBQ is (REX.W==1) 0F B6 /r
// I'm not sure why the Intel manual offers different
// encodings for MOVZBQ than for MOVZBL. As I understand it,
// they should achieve the same result, since MOVZBL is going
// to zero out the upper half of the destination anyway.
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB6,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::WL => {
// MOVZWL is (REX.W==0) 0F B7 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB7,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::WQ => {
// MOVZWQ is (REX.W==1) 0F B7 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB7,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::LQ => {
// This is just a standard 32-bit load, and we rely on the
// default zero-extension rule to perform the extension.
// MOV r/m32, r32 is (REX.W==0) 8B /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x8B,
1,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
}
}
Inst::Mov64_M_R { addr, dst } => {
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, 0x8B, 1, dst.to_reg(), addr, F_NONE)
}
Inst::MovSX_M_R { extMode, addr, dst } => {
match extMode {
ExtMode::BL => {
// MOVSBL is (REX.W==0) 0F BE /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBE,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::BQ => {
// MOVSBQ is (REX.W==1) 0F BE /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBE,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::WL => {
// MOVSWL is (REX.W==0) 0F BF /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBF,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::WQ => {
// MOVSWQ is (REX.W==1) 0F BF /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBF,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::LQ => {
// MOVSLQ is (REX.W==1) 63 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x63,
1,
dst.to_reg(),
addr,
F_NONE,
)
}
}
}
Inst::Mov_R_M { size, src, addr } => {
match size {
1 => {
// This is one of the few places where the presence of a
// redundant REX prefix changes the meaning of the
// instruction.
let encSrc = iregEnc(*src);
let retainRedundantRex = if encSrc >= 4 && encSrc <= 7 {
F_RETAIN_REDUNDANT_REX
} else {
0
};
// MOV r8, r/m8 is (REX.W==0) 88 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x88,
1,
*src,
addr,
F_CLEAR_REX_W | retainRedundantRex,
)
}
2 => {
// MOV r16, r/m16 is 66 (REX.W==0) 89 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x89,
1,
*src,
addr,
F_CLEAR_REX_W | F_PREFIX_66,
)
}
4 => {
// MOV r32, r/m32 is (REX.W==0) 89 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x89,
1,
*src,
addr,
F_CLEAR_REX_W,
)
}
8 => {
// MOV r64, r/m64 is (REX.W==1) 89 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, 0x89, 1, *src, addr, F_NONE)
}
_ => panic!("x64::Inst::Mov_R_M::emit: unreachable"),
}
}
Inst::Shift_R {
is_64,
kind,
num_bits,
dst,
} => {
let encDst = iregEnc(dst.to_reg());
let subopcode = match kind {
ShiftKind::Left => 4,
ShiftKind::RightZ => 5,
ShiftKind::RightS => 7,
};
match num_bits {
None => {
// SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
// SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xD3,
1,
subopcode,
encDst,
if *is_64 { F_NONE } else { F_CLEAR_REX_W },
);
}
Some(num_bits) => {
// SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
// SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
// When the shift amount is 1, there's an even shorter encoding, but we don't
// bother with that nicety here.
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xC1,
1,
subopcode,
encDst,
if *is_64 { F_NONE } else { F_CLEAR_REX_W },
);
sink.put1(*num_bits);
}
}
}
Inst::Cmp_RMI_R {
size,
src: srcE,
dst: regG,
} => {
let mut retainRedundantRex = 0;
if *size == 1 {
// Here, a redundant REX prefix changes the meaning of the
// instruction.
let encG = iregEnc(*regG);
if encG >= 4 && encG <= 7 {
retainRedundantRex = F_RETAIN_REDUNDANT_REX;
}
}
let mut flags = match size {
8 => F_NONE,
4 => F_CLEAR_REX_W,
2 => F_CLEAR_REX_W | F_PREFIX_66,
1 => F_CLEAR_REX_W | retainRedundantRex,
_ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"),
};
match srcE {
RMI::R { reg: regE } => {
let opcode = if *size == 1 { 0x38 } else { 0x39 };
if *size == 1 {
// We also need to check whether the E register forces
// the use of a redundant REX.
let encE = iregEnc(*regE);
if encE >= 4 && encE <= 7 {
flags |= F_RETAIN_REDUNDANT_REX;
}
}
// Same comment re swapped args as for Alu_RMI_R.
emit_REX_OPCODES_MODRM_regG_regE(sink, opcode, 1, *regE, *regG, flags);
}
RMI::M { addr } => {
let opcode = if *size == 1 { 0x3A } else { 0x3B };
// Whereas here we revert to the "normal" G-E ordering.
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, opcode, 1, *regG, addr, flags);
}
RMI::I { simm32 } => {
// FIXME JRS 2020Feb11: there are shorter encodings for
// cmp $imm, rax/eax/ax/al.
let useImm8 = low8willSXto32(*simm32);
let opcode = if *size == 1 {
0x80
} else if useImm8 {
0x83
} else {
0x81
};
// And also here we use the "normal" G-E ordering.
let encG = iregEnc(*regG);
emit_REX_OPCODES_MODRM_encG_encE(
sink, opcode, 1, 7, /*subopcode*/
encG, flags,
);
emit_simm(sink, if useImm8 { 1 } else { *size }, *simm32);
}
}
}
Inst::Push64 { src } => {
match src {
RMI::R { reg } => {
let encReg = iregEnc(*reg);
let rex = 0x40 | ((encReg >> 3) & 1);
if rex != 0x40 {
sink.put1(rex);
}
sink.put1(0x50 | (encReg & 7));
}
RMI::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink,
0xFF,
1,
6, /*subopcode*/
addr,
F_CLEAR_REX_W,
);
}
RMI::I { simm32 } => {
if low8willSXto64(*simm32) {
sink.put1(0x6A);
sink.put1(*simm32 as u8);
} else {
sink.put1(0x68);
sink.put4(*simm32);
}
}
}
}
Inst::Pop64 { dst } => {
let encDst = iregEnc(dst.to_reg());
if encDst >= 8 {
// 0x41 == REX.{W=0, B=1}. It seems that REX.W is irrelevant
// here.
sink.put1(0x41);
}
sink.put1(0x58 + (encDst & 7));
}
//
// ** Inst::CallKnown
//
Inst::CallUnknown { dest } => {
match dest {
RM::R { reg } => {
let regEnc = iregEnc(*reg);
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xFF,
1,
2, /*subopcode*/
regEnc,
F_CLEAR_REX_W,
);
}
RM::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink,
0xFF,
1,
2, /*subopcode*/
addr,
F_CLEAR_REX_W,
);
}
}
}
Inst::Ret {} => sink.put1(0xC3),
Inst::JmpKnown { dest } => {
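// This instruction (E9 + rel32) is 5 bytes long. As in the JmpCondSymm case
// below, the target offset is relative to the start of this instruction, but
// the encoding requires it to be relative to the start of the next one; hence
// the -5 adjustment.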
let disp = dest.as_offset32_or_zero() - 5;
let disp = disp as u32;
let br_start = sink.cur_offset();
let br_disp_off = br_start + 1;
let br_end = br_start + 5;
if let Some(l) = dest.as_label() {
sink.use_label_at_offset(br_disp_off, l, LabelUse::Rel32);
sink.add_uncond_branch(br_start, br_end, l);
}
sink.put1(0xE9);
sink.put4(disp);
}
Inst::JmpCondSymm {
cc,
taken,
not_taken,
} => {
// Conditional part.
// This insn is 6 bytes long. Currently `offset` is relative to
// the start of this insn, but the Intel encoding requires it to
// be relative to the start of the next instruction. Hence the
// adjustment.
let taken_disp = taken.as_offset32_or_zero() - 6;
let taken_disp = taken_disp as u32;
let cond_start = sink.cur_offset();
let cond_disp_off = cond_start + 2;
let cond_end = cond_start + 6;
if let Some(l) = taken.as_label() {
sink.use_label_at_offset(cond_disp_off, l, LabelUse::Rel32);
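// The inverted form of this conditional branch, handed to the MachBuffer so
// that it can flip the branch during branch optimization. Its last four bytes
// (FA FF FF FF) are the little-endian placeholder displacement -6, matching
// the placeholder used by the non-inverted emission below.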
let inverted: [u8; 6] =
[0x0F, 0x80 + (cc.invert().get_enc()), 0xFA, 0xFF, 0xFF, 0xFF];
sink.add_cond_branch(cond_start, cond_end, l, &inverted[..]);
}
sink.put1(0x0F);
sink.put1(0x80 + cc.get_enc());
sink.put4(taken_disp);
// Unconditional part.
let nt_disp = not_taken.as_offset32_or_zero() - 5;
let nt_disp = nt_disp as u32;
let uncond_start = sink.cur_offset();
let uncond_disp_off = uncond_start + 1;
let uncond_end = uncond_start + 5;
if let Some(l) = not_taken.as_label() {
sink.use_label_at_offset(uncond_disp_off, l, LabelUse::Rel32);
sink.add_uncond_branch(uncond_start, uncond_end, l);
}
sink.put1(0xE9);
sink.put4(nt_disp);
}
Inst::JmpUnknown { target } => {
match target {
RM::R { reg } => {
let regEnc = iregEnc(*reg);
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xFF,
1,
4, /*subopcode*/
regEnc,
F_CLEAR_REX_W,
);
}
RM::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink,
0xFF,
1,
4, /*subopcode*/
addr,
F_CLEAR_REX_W,
);
}
}
}
_ => panic!("x64_emit: unhandled: {} ", inst.show_rru(None)),
}
}

2191
third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs vendored Normal file

Diff not shown because of its large size.

905
third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs vendored Normal file

@@ -0,0 +1,905 @@
//! This module defines x86_64-specific machine instruction types.
#![allow(dead_code)]
#![allow(non_snake_case)]
#![allow(non_camel_case_types)]
use core::convert::TryFrom;
use smallvec::SmallVec;
use std::fmt;
use std::string::{String, ToString};
use regalloc::RegUsageCollector;
use regalloc::Set;
use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable};
use crate::binemit::CodeOffset;
use crate::ir::types::{B1, B128, B16, B32, B64, B8, F32, F64, I128, I16, I32, I64, I8};
use crate::ir::ExternalName;
use crate::ir::Type;
use crate::machinst::*;
use crate::settings::Flags;
use crate::{settings, CodegenError, CodegenResult};
pub mod args;
mod emit;
#[cfg(test)]
mod emit_tests;
pub mod regs;
use args::*;
use regs::{create_reg_universe_systemv, show_ireg_sized};
//=============================================================================
// Instructions (top level): definition
// Don't build these directly. Instead use the Inst:: functions to create them.
/// Instructions. Destinations are on the RIGHT (a la AT&T syntax).
#[derive(Clone)]
pub(crate) enum Inst {
/// nops of various sizes, including zero
Nop { len: u8 },
/// (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg
Alu_RMI_R {
is_64: bool,
op: RMI_R_Op,
src: RMI,
dst: Writable<Reg>,
},
/// (imm32 imm64) reg.
/// Either: movl $imm32, %reg32 or movabsq $imm64, %reg64
Imm_R {
dst_is_64: bool,
simm64: u64,
dst: Writable<Reg>,
},
/// mov (64 32) reg reg
Mov_R_R {
is_64: bool,
src: Reg,
dst: Writable<Reg>,
},
/// movz (bl bq wl wq lq) addr reg (good for all ZX loads except 64->64).
/// Note that the lq variant doesn't really exist since the default
/// zero-extend rule makes it unnecessary. For that case we emit the
/// equivalent "movl AM, reg32".
MovZX_M_R {
extMode: ExtMode,
addr: Addr,
dst: Writable<Reg>,
},
/// A plain 64-bit integer load, since MovZX_M_R can't represent that
Mov64_M_R { addr: Addr, dst: Writable<Reg> },
/// movs (bl bq wl wq lq) addr reg (good for all SX loads)
MovSX_M_R {
extMode: ExtMode,
addr: Addr,
dst: Writable<Reg>,
},
/// mov (b w l q) reg addr (good for all integer stores)
Mov_R_M {
size: u8, // 1, 2, 4 or 8
src: Reg,
addr: Addr,
},
/// (shl shr sar) (l q) imm reg
Shift_R {
is_64: bool,
kind: ShiftKind,
/// shift count: Some(0 .. #bits-in-type - 1), or None to mean "%cl".
num_bits: Option<u8>,
dst: Writable<Reg>,
},
/// cmp (b w l q) (reg addr imm) reg
Cmp_RMI_R {
size: u8, // 1, 2, 4 or 8
src: RMI,
dst: Reg,
},
/// pushq (reg addr imm)
Push64 { src: RMI },
/// popq reg
Pop64 { dst: Writable<Reg> },
/// call simm32
CallKnown {
dest: ExternalName,
uses: Set<Reg>,
defs: Set<Writable<Reg>>,
},
/// callq (reg mem)
CallUnknown {
dest: RM,
//uses: Set<Reg>,
//defs: Set<Writable<Reg>>,
},
// ---- branches (exactly one must appear at end of BB) ----
/// ret
Ret,
/// A placeholder instruction, generating no code, meaning that a function epilogue must be
/// inserted there.
EpiloguePlaceholder,
/// jmp simm32
JmpKnown { dest: BranchTarget },
/// jcond cond target target
/// Symmetrical two-way conditional branch.
/// Emitted as a compound sequence; the MachBuffer will shrink it
/// as appropriate.
JmpCondSymm {
cc: CC,
taken: BranchTarget,
not_taken: BranchTarget,
},
/// jmpq (reg mem)
JmpUnknown { target: RM },
}
// Handy constructors for Insts.
// For various sizes: does sign-extending the lowest N bits of a value
// reproduce the whole value?
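// For example, low32willSXto64(0xFFFF_FFFF_8000_0000) is true (the low 32 bits
// sign-extend back to the whole value), whereas low32willSXto64(0x1_0000_0000)
// is false.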
pub(crate) fn low32willSXto64(x: u64) -> bool {
let xs = x as i64;
xs == ((xs << 32) >> 32)
}
impl Inst {
pub(crate) fn nop(len: u8) -> Self {
debug_assert!(len <= 16);
Self::Nop { len }
}
pub(crate) fn alu_rmi_r(is_64: bool, op: RMI_R_Op, src: RMI, dst: Writable<Reg>) -> Self {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Self::Alu_RMI_R {
is_64,
op,
src,
dst,
}
}
pub(crate) fn imm_r(dst_is_64: bool, simm64: u64, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
if !dst_is_64 {
debug_assert!(low32willSXto64(simm64));
}
Inst::Imm_R {
dst_is_64,
simm64,
dst,
}
}
pub(crate) fn mov_r_r(is_64: bool, src: Reg, dst: Writable<Reg>) -> Inst {
debug_assert!(src.get_class() == RegClass::I64);
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::Mov_R_R { is_64, src, dst }
}
pub(crate) fn movzx_m_r(extMode: ExtMode, addr: Addr, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::MovZX_M_R { extMode, addr, dst }
}
pub(crate) fn mov64_m_r(addr: Addr, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::Mov64_M_R { addr, dst }
}
pub(crate) fn movsx_m_r(extMode: ExtMode, addr: Addr, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::MovSX_M_R { extMode, addr, dst }
}
pub(crate) fn mov_r_m(
size: u8, // 1, 2, 4 or 8
src: Reg,
addr: Addr,
) -> Inst {
debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
debug_assert!(src.get_class() == RegClass::I64);
Inst::Mov_R_M { size, src, addr }
}
pub(crate) fn shift_r(
is_64: bool,
kind: ShiftKind,
num_bits: Option<u8>,
dst: Writable<Reg>,
) -> Inst {
debug_assert!(if let Some(num_bits) = num_bits {
num_bits < if is_64 { 64 } else { 32 }
} else {
true
});
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::Shift_R {
is_64,
kind,
num_bits,
dst,
}
}
pub(crate) fn cmp_rmi_r(
size: u8, // 1, 2, 4 or 8
src: RMI,
dst: Reg,
) -> Inst {
debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
debug_assert!(dst.get_class() == RegClass::I64);
Inst::Cmp_RMI_R { size, src, dst }
}
pub(crate) fn push64(src: RMI) -> Inst {
Inst::Push64 { src }
}
pub(crate) fn pop64(dst: Writable<Reg>) -> Inst {
Inst::Pop64 { dst }
}
pub(crate) fn call_unknown(dest: RM) -> Inst {
Inst::CallUnknown { dest }
}
pub(crate) fn ret() -> Inst {
Inst::Ret
}
pub(crate) fn epilogue_placeholder() -> Inst {
Inst::EpiloguePlaceholder
}
pub(crate) fn jmp_known(dest: BranchTarget) -> Inst {
Inst::JmpKnown { dest }
}
pub(crate) fn jmp_cond_symm(cc: CC, taken: BranchTarget, not_taken: BranchTarget) -> Inst {
Inst::JmpCondSymm {
cc,
taken,
not_taken,
}
}
pub(crate) fn jmp_unknown(target: RM) -> Inst {
Inst::JmpUnknown { target }
}
}
//=============================================================================
// Instructions: printing
impl ShowWithRRU for Inst {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
fn ljustify(s: String) -> String {
let w = 7;
if s.len() >= w {
s
} else {
let need = usize::min(w, w - s.len());
s + &format!("{nil: <width$}", nil = "", width = need)
}
}
fn ljustify2(s1: String, s2: String) -> String {
ljustify(s1 + &s2)
}
fn suffixLQ(is_64: bool) -> String {
(if is_64 { "q" } else { "l" }).to_string()
}
fn sizeLQ(is_64: bool) -> u8 {
if is_64 {
8
} else {
4
}
}
fn suffixBWLQ(size: u8) -> String {
match size {
1 => "b".to_string(),
2 => "w".to_string(),
4 => "l".to_string(),
8 => "q".to_string(),
_ => panic!("Inst(x64).show.suffixBWLQ: size={}", size),
}
}
match self {
Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len),
Inst::Alu_RMI_R {
is_64,
op,
src,
dst,
} => format!(
"{} {}, {}",
ljustify2(op.to_string(), suffixLQ(*is_64)),
src.show_rru_sized(mb_rru, sizeLQ(*is_64)),
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64)),
),
Inst::Imm_R {
dst_is_64,
simm64,
dst,
} => {
if *dst_is_64 {
format!(
"{} ${}, {}",
ljustify("movabsq".to_string()),
*simm64 as i64,
show_ireg_sized(dst.to_reg(), mb_rru, 8)
)
} else {
format!(
"{} ${}, {}",
ljustify("movl".to_string()),
(*simm64 as u32) as i32,
show_ireg_sized(dst.to_reg(), mb_rru, 4)
)
}
}
Inst::Mov_R_R { is_64, src, dst } => format!(
"{} {}, {}",
ljustify2("mov".to_string(), suffixLQ(*is_64)),
show_ireg_sized(*src, mb_rru, sizeLQ(*is_64)),
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64))
),
Inst::MovZX_M_R { extMode, addr, dst } => {
if *extMode == ExtMode::LQ {
format!(
"{} {}, {}",
ljustify("movl".to_string()),
addr.show_rru(mb_rru),
show_ireg_sized(dst.to_reg(), mb_rru, 4)
)
} else {
format!(
"{} {}, {}",
ljustify2("movz".to_string(), extMode.to_string()),
addr.show_rru(mb_rru),
show_ireg_sized(dst.to_reg(), mb_rru, extMode.dst_size())
)
}
}
Inst::Mov64_M_R { addr, dst } => format!(
"{} {}, {}",
ljustify("movq".to_string()),
addr.show_rru(mb_rru),
dst.show_rru(mb_rru)
),
Inst::MovSX_M_R { extMode, addr, dst } => format!(
"{} {}, {}",
ljustify2("movs".to_string(), extMode.to_string()),
addr.show_rru(mb_rru),
show_ireg_sized(dst.to_reg(), mb_rru, extMode.dst_size())
),
Inst::Mov_R_M { size, src, addr } => format!(
"{} {}, {}",
ljustify2("mov".to_string(), suffixBWLQ(*size)),
show_ireg_sized(*src, mb_rru, *size),
addr.show_rru(mb_rru)
),
Inst::Shift_R {
is_64,
kind,
num_bits,
dst,
} => match num_bits {
None => format!(
"{} %cl, {}",
ljustify2(kind.to_string(), suffixLQ(*is_64)),
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64))
),
Some(num_bits) => format!(
"{} ${}, {}",
ljustify2(kind.to_string(), suffixLQ(*is_64)),
num_bits,
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64))
),
},
Inst::Cmp_RMI_R { size, src, dst } => format!(
"{} {}, {}",
ljustify2("cmp".to_string(), suffixBWLQ(*size)),
src.show_rru_sized(mb_rru, *size),
show_ireg_sized(*dst, mb_rru, *size)
),
Inst::Push64 { src } => {
format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru))
}
Inst::Pop64 { dst } => {
format!("{} {}", ljustify("popq".to_string()), dst.show_rru(mb_rru))
}
//Inst::CallKnown { target } => format!("{} {:?}", ljustify("call".to_string()), target),
Inst::CallKnown { .. } => "**CallKnown**".to_string(),
Inst::CallUnknown { dest } => format!(
"{} *{}",
ljustify("call".to_string()),
dest.show_rru(mb_rru)
),
Inst::Ret => "ret".to_string(),
Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(),
Inst::JmpKnown { dest } => {
format!("{} {}", ljustify("jmp".to_string()), dest.show_rru(mb_rru))
}
Inst::JmpCondSymm {
cc,
taken,
not_taken,
} => format!(
"{} taken={} not_taken={}",
ljustify2("j".to_string(), cc.to_string()),
taken.show_rru(mb_rru),
not_taken.show_rru(mb_rru)
),
//
Inst::JmpUnknown { target } => format!(
"{} *{}",
ljustify("jmp".to_string()),
target.show_rru(mb_rru)
),
}
}
}
// Temp hook for legacy printing machinery
impl fmt::Debug for Inst {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
// Print the insn without a Universe :-(
write!(fmt, "{}", self.show_rru(None))
}
}
fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
// This is a bit subtle. If some register is in the modified set, then it may not be in either
// the use or def sets. However, enforcing that directly is somewhat difficult. Instead,
// regalloc.rs will "fix" this for us by removing the modified set from the use and def
// sets.
match inst {
// ** Nop
Inst::Alu_RMI_R {
is_64: _,
op: _,
src,
dst,
} => {
src.get_regs_as_uses(collector);
collector.add_mod(*dst);
}
Inst::Imm_R {
dst_is_64: _,
simm64: _,
dst,
} => {
collector.add_def(*dst);
}
Inst::Mov_R_R { is_64: _, src, dst } => {
collector.add_use(*src);
collector.add_def(*dst);
}
Inst::MovZX_M_R {
extMode: _,
addr,
dst,
} => {
addr.get_regs_as_uses(collector);
collector.add_def(*dst);
}
Inst::Mov64_M_R { addr, dst } => {
addr.get_regs_as_uses(collector);
collector.add_def(*dst);
}
Inst::MovSX_M_R {
extMode: _,
addr,
dst,
} => {
addr.get_regs_as_uses(collector);
collector.add_def(*dst);
}
Inst::Mov_R_M { size: _, src, addr } => {
collector.add_use(*src);
addr.get_regs_as_uses(collector);
}
Inst::Shift_R {
is_64: _,
kind: _,
num_bits,
dst,
} => {
if num_bits.is_none() {
collector.add_use(regs::rcx());
}
collector.add_mod(*dst);
}
Inst::Cmp_RMI_R { size: _, src, dst } => {
src.get_regs_as_uses(collector);
collector.add_use(*dst); // yes, really `add_use`
}
Inst::Push64 { src } => {
src.get_regs_as_uses(collector);
collector.add_mod(Writable::from_reg(regs::rsp()));
}
Inst::Pop64 { dst } => {
collector.add_def(*dst);
}
Inst::CallKnown {
dest: _,
uses: _,
defs: _,
} => {
// FIXME add arg regs (iru.used) and caller-saved regs (iru.defined)
unimplemented!();
}
Inst::CallUnknown { dest } => {
dest.get_regs_as_uses(collector);
}
Inst::Ret => {}
Inst::EpiloguePlaceholder => {}
Inst::JmpKnown { dest: _ } => {}
Inst::JmpCondSymm {
cc: _,
taken: _,
not_taken: _,
} => {}
//Inst::JmpUnknown { target } => {
// target.get_regs_as_uses(collector);
//}
Inst::Nop { .. } | Inst::JmpUnknown { .. } => unimplemented!("x64_get_regs inst"),
}
}
//=============================================================================
// Instructions and subcomponents: map_regs
fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) {
if r.is_virtual() {
let new = m.get_use(r.to_virtual_reg()).unwrap().to_reg();
*r = new;
}
}
fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
if r.to_reg().is_virtual() {
let new = m.get_def(r.to_reg().to_virtual_reg()).unwrap().to_reg();
*r = Writable::from_reg(new);
}
}
fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
if r.to_reg().is_virtual() {
let new = m.get_mod(r.to_reg().to_virtual_reg()).unwrap().to_reg();
*r = Writable::from_reg(new);
}
}
impl Addr {
fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
match self {
Addr::IR {
simm32: _,
ref mut base,
} => map_use(map, base),
Addr::IRRS {
simm32: _,
ref mut base,
ref mut index,
shift: _,
} => {
map_use(map, base);
map_use(map, index);
}
}
}
}
impl RMI {
fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
match self {
RMI::R { ref mut reg } => map_use(map, reg),
RMI::M { ref mut addr } => addr.map_uses(map),
RMI::I { simm32: _ } => {}
}
}
}
impl RM {
fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
match self {
RM::R { ref mut reg } => map_use(map, reg),
RM::M { ref mut addr } => addr.map_uses(map),
}
}
}
fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
// Note this must be carefully synchronized with x64_get_regs.
match inst {
// ** Nop
Inst::Alu_RMI_R {
is_64: _,
op: _,
ref mut src,
ref mut dst,
} => {
src.map_uses(mapper);
map_mod(mapper, dst);
}
Inst::Imm_R {
dst_is_64: _,
simm64: _,
ref mut dst,
} => map_def(mapper, dst),
Inst::Mov_R_R {
is_64: _,
ref mut src,
ref mut dst,
} => {
map_use(mapper, src);
map_def(mapper, dst);
}
Inst::MovZX_M_R {
extMode: _,
ref mut addr,
ref mut dst,
} => {
addr.map_uses(mapper);
map_def(mapper, dst);
}
Inst::Mov64_M_R { addr, dst } => {
addr.map_uses(mapper);
map_def(mapper, dst);
}
Inst::MovSX_M_R {
extMode: _,
ref mut addr,
ref mut dst,
} => {
addr.map_uses(mapper);
map_def(mapper, dst);
}
Inst::Mov_R_M {
size: _,
ref mut src,
ref mut addr,
} => {
map_use(mapper, src);
addr.map_uses(mapper);
}
Inst::Shift_R {
is_64: _,
kind: _,
num_bits: _,
ref mut dst,
} => {
map_mod(mapper, dst);
}
Inst::Cmp_RMI_R {
size: _,
ref mut src,
ref mut dst,
} => {
src.map_uses(mapper);
map_use(mapper, dst);
}
Inst::Push64 { ref mut src } => src.map_uses(mapper),
Inst::Pop64 { ref mut dst } => {
map_def(mapper, dst);
}
Inst::CallKnown {
dest: _,
uses: _,
defs: _,
} => {}
Inst::CallUnknown { dest } => dest.map_uses(mapper),
Inst::Ret => {}
Inst::EpiloguePlaceholder => {}
Inst::JmpKnown { dest: _ } => {}
Inst::JmpCondSymm {
cc: _,
taken: _,
not_taken: _,
} => {}
//Inst::JmpUnknown { target } => {
// target.apply_map(mapper);
//}
Inst::Nop { .. } | Inst::JmpUnknown { .. } => unimplemented!("x64_map_regs opcode"),
}
}
//=============================================================================
// Instructions: misc functions and external interface
impl MachInst for Inst {
fn get_regs(&self, collector: &mut RegUsageCollector) {
x64_get_regs(&self, collector)
}
fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
x64_map_regs(self, mapper);
}
fn is_move(&self) -> Option<(Writable<Reg>, Reg)> {
// Note (carefully!) that a 32-bit mov *isn't* a no-op since it zeroes
// out the upper 32 bits of the destination. For example, we could
// conceivably use `movl %reg, %reg` to zero out the top 32 bits of
// %reg.
match self {
Self::Mov_R_R { is_64, src, dst } if *is_64 => Some((*dst, *src)),
_ => None,
}
}
fn is_epilogue_placeholder(&self) -> bool {
if let Self::EpiloguePlaceholder = self {
true
} else {
false
}
}
fn is_term<'a>(&'a self) -> MachTerminator<'a> {
match self {
// Interesting cases.
&Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret,
&Self::JmpKnown { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
&Self::JmpCondSymm {
cc: _,
taken,
not_taken,
} => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()),
// All other cases are boring.
_ => MachTerminator::None,
}
}
fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, _ty: Type) -> Inst {
let rc_dst = dst_reg.to_reg().get_class();
let rc_src = src_reg.get_class();
// If this isn't true, we have gone way off the rails.
debug_assert!(rc_dst == rc_src);
match rc_dst {
RegClass::I64 => Inst::mov_r_r(true, src_reg, dst_reg),
_ => panic!("gen_move(x64): unhandled regclass"),
}
}
fn gen_zero_len_nop() -> Inst {
unimplemented!()
}
fn gen_nop(_preferred_size: usize) -> Inst {
unimplemented!()
}
fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> {
None
}
fn rc_for_type(ty: Type) -> CodegenResult<RegClass> {
match ty {
I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => Ok(RegClass::I64),
F32 | F64 | I128 | B128 => Ok(RegClass::V128),
_ => Err(CodegenError::Unsupported(format!(
"Unexpected SSA-value type: {}",
ty
))),
}
}
fn gen_jump(label: MachLabel) -> Inst {
Inst::jmp_known(BranchTarget::Label(label))
}
fn gen_constant(to_reg: Writable<Reg>, value: u64, _: Type) -> SmallVec<[Self; 4]> {
let mut ret = SmallVec::new();
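// Constants that fit in 32 bits are materialized with a 32-bit `movl`, relying
// on the implicit zero-extension of 32-bit operations; larger constants use
// `movabsq`.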
let is64 = value > 0xffff_ffff;
ret.push(Inst::imm_r(is64, value, to_reg));
ret
}
fn reg_universe(flags: &Flags) -> RealRegUniverse {
create_reg_universe_systemv(flags)
}
fn worst_case_size() -> CodeOffset {
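// The architectural limit on the length of a single x86-64 instruction.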
15
}
type LabelUse = LabelUse;
}
impl MachInstEmit for Inst {
type State = ();
fn emit(&self, sink: &mut MachBuffer<Inst>, _flags: &settings::Flags, _: &mut Self::State) {
emit::emit(self, sink);
}
}
/// A label-use (internal relocation) in generated code.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum LabelUse {
/// A 32-bit offset from location of relocation itself, added to the
/// existing value at that location.
Rel32,
}
impl MachInstLabelUse for LabelUse {
const ALIGN: CodeOffset = 1;
fn max_pos_range(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 0x7fff_ffff,
}
}
fn max_neg_range(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 0x8000_0000,
}
}
fn patch_size(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 4,
}
}
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
match self {
LabelUse::Rel32 => {
let addend = i32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
let value = i32::try_from(label_offset)
.unwrap()
.wrapping_sub(i32::try_from(use_offset).unwrap())
.wrapping_add(addend);
buffer.copy_from_slice(&value.to_le_bytes()[..]);
}
}
}
fn supports_veneer(self) -> bool {
match self {
LabelUse::Rel32 => false,
}
}
fn veneer_size(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 0,
}
}
fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) {
match self {
LabelUse::Rel32 => {
panic!("Veneer not supported for Rel32 label-use.");
}
}
}
}

261
third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs vendored Normal file

@@ -0,0 +1,261 @@
//! Registers, the Universe thereof, and printing.
//!
//! These are ordered by sequence number, as required in the Universe. The strange ordering is
//! intended to make callee-save registers available before caller-saved ones. This is a net win
//! provided that each function makes at least one onward call. It'll be a net loss for leaf
//! functions, and we should change the ordering in that case, so as to make caller-save regs
//! available first.
//!
//! TODO Maybe have two different universes, one for leaf functions and one for non-leaf functions?
//! Also, they will have to be ABI dependent. Need to find a way to avoid constructing a universe
//! for each function we compile.
use alloc::vec::Vec;
use std::string::String;
use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, NUM_REG_CLASSES};
use crate::machinst::pretty_print::ShowWithRRU;
use crate::settings;
// Hardware encodings for a few registers.
pub const ENC_RBX: u8 = 3;
pub const ENC_RSP: u8 = 4;
pub const ENC_RBP: u8 = 5;
pub const ENC_R12: u8 = 12;
pub const ENC_R13: u8 = 13;
pub const ENC_R14: u8 = 14;
pub const ENC_R15: u8 = 15;
fn gpr(enc: u8, index: u8) -> Reg {
Reg::new_real(RegClass::I64, enc, index)
}
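// Note that in the `gpr` calls below, the first argument is the hardware
// encoding and the second is the register's index in the universe constructed
// at the bottom of this file; the two orderings intentionally differ.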
pub(crate) fn r12() -> Reg {
gpr(ENC_R12, 0)
}
pub(crate) fn r13() -> Reg {
gpr(ENC_R13, 1)
}
pub(crate) fn r14() -> Reg {
gpr(ENC_R14, 2)
}
pub(crate) fn r15() -> Reg {
gpr(ENC_R15, 3)
}
pub(crate) fn rbx() -> Reg {
gpr(ENC_RBX, 4)
}
pub(crate) fn rsi() -> Reg {
gpr(6, 5)
}
pub(crate) fn rdi() -> Reg {
gpr(7, 6)
}
pub(crate) fn rax() -> Reg {
gpr(0, 7)
}
pub(crate) fn rcx() -> Reg {
gpr(1, 8)
}
pub(crate) fn rdx() -> Reg {
gpr(2, 9)
}
pub(crate) fn r8() -> Reg {
gpr(8, 10)
}
pub(crate) fn r9() -> Reg {
gpr(9, 11)
}
pub(crate) fn r10() -> Reg {
gpr(10, 12)
}
pub(crate) fn r11() -> Reg {
gpr(11, 13)
}
fn fpr(enc: u8, index: u8) -> Reg {
Reg::new_real(RegClass::V128, enc, index)
}
fn xmm0() -> Reg {
fpr(0, 14)
}
fn xmm1() -> Reg {
fpr(1, 15)
}
fn xmm2() -> Reg {
fpr(2, 16)
}
fn xmm3() -> Reg {
fpr(3, 17)
}
fn xmm4() -> Reg {
fpr(4, 18)
}
fn xmm5() -> Reg {
fpr(5, 19)
}
fn xmm6() -> Reg {
fpr(6, 20)
}
fn xmm7() -> Reg {
fpr(7, 21)
}
fn xmm8() -> Reg {
fpr(8, 22)
}
fn xmm9() -> Reg {
fpr(9, 23)
}
fn xmm10() -> Reg {
fpr(10, 24)
}
fn xmm11() -> Reg {
fpr(11, 25)
}
fn xmm12() -> Reg {
fpr(12, 26)
}
fn xmm13() -> Reg {
fpr(13, 27)
}
fn xmm14() -> Reg {
fpr(14, 28)
}
fn xmm15() -> Reg {
fpr(15, 29)
}
pub(crate) fn rsp() -> Reg {
gpr(ENC_RSP, 30)
}
pub(crate) fn rbp() -> Reg {
gpr(ENC_RBP, 31)
}
/// Create the register universe for X64.
///
/// The ordering of registers matters, as explained in the module doc comment. At the moment this
/// assumes the SystemV calling convention.
pub(crate) fn create_reg_universe_systemv(_flags: &settings::Flags) -> RealRegUniverse {
let mut regs = Vec::<(RealReg, String)>::new();
let mut allocable_by_class = [None; NUM_REG_CLASSES];
// Integer regs.
let mut base = regs.len();
// Callee-saved, in the SystemV x86_64 ABI.
regs.push((r12().to_real_reg(), "%r12".into()));
regs.push((r13().to_real_reg(), "%r13".into()));
regs.push((r14().to_real_reg(), "%r14".into()));
regs.push((r15().to_real_reg(), "%r15".into()));
regs.push((rbx().to_real_reg(), "%rbx".into()));
// Caller-saved, in the SystemV x86_64 ABI.
regs.push((rsi().to_real_reg(), "%rsi".into()));
regs.push((rdi().to_real_reg(), "%rdi".into()));
regs.push((rax().to_real_reg(), "%rax".into()));
regs.push((rcx().to_real_reg(), "%rcx".into()));
regs.push((rdx().to_real_reg(), "%rdx".into()));
regs.push((r8().to_real_reg(), "%r8".into()));
regs.push((r9().to_real_reg(), "%r9".into()));
regs.push((r10().to_real_reg(), "%r10".into()));
regs.push((r11().to_real_reg(), "%r11".into()));
allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
first: base,
last: regs.len() - 1,
suggested_scratch: Some(r12().get_index()),
});
// XMM registers
base = regs.len();
regs.push((xmm0().to_real_reg(), "%xmm0".into()));
regs.push((xmm1().to_real_reg(), "%xmm1".into()));
regs.push((xmm2().to_real_reg(), "%xmm2".into()));
regs.push((xmm3().to_real_reg(), "%xmm3".into()));
regs.push((xmm4().to_real_reg(), "%xmm4".into()));
regs.push((xmm5().to_real_reg(), "%xmm5".into()));
regs.push((xmm6().to_real_reg(), "%xmm6".into()));
regs.push((xmm7().to_real_reg(), "%xmm7".into()));
regs.push((xmm8().to_real_reg(), "%xmm8".into()));
regs.push((xmm9().to_real_reg(), "%xmm9".into()));
regs.push((xmm10().to_real_reg(), "%xmm10".into()));
regs.push((xmm11().to_real_reg(), "%xmm11".into()));
regs.push((xmm12().to_real_reg(), "%xmm12".into()));
regs.push((xmm13().to_real_reg(), "%xmm13".into()));
regs.push((xmm14().to_real_reg(), "%xmm14".into()));
regs.push((xmm15().to_real_reg(), "%xmm15".into()));
allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo {
first: base,
last: regs.len() - 1,
suggested_scratch: Some(xmm15().get_index()),
});
// Other regs, not available to the allocator.
let allocable = regs.len();
regs.push((rsp().to_real_reg(), "%rsp".into()));
regs.push((rbp().to_real_reg(), "%rbp".into()));
RealRegUniverse {
regs,
allocable,
allocable_by_class,
}
}
/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show its name at some
/// smaller size (4, 2 or 1 bytes).
pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
let mut s = reg.show_rru(mb_rru);
if reg.get_class() != RegClass::I64 || size == 8 {
// We can't do any better.
return s;
}
if reg.is_real() {
// Change (eg) "rax" into "eax", "ax" or "al" as appropriate. This is something one could
// describe diplomatically as "a kludge", but it's only debug code.
let remapper = match s.as_str() {
"%rax" => Some(["%eax", "%ax", "%al"]),
"%rbx" => Some(["%ebx", "%bx", "%bl"]),
"%rcx" => Some(["%ecx", "%cx", "%cl"]),
"%rdx" => Some(["%edx", "%dx", "%dl"]),
"%rsi" => Some(["%esi", "%si", "%sil"]),
"%rdi" => Some(["%edi", "%di", "%dil"]),
"%rbp" => Some(["%ebp", "%bp", "%bpl"]),
"%rsp" => Some(["%esp", "%sp", "%spl"]),
"%r8" => Some(["%r8d", "%r8w", "%r8b"]),
"%r9" => Some(["%r9d", "%r9w", "%r9b"]),
"%r10" => Some(["%r10d", "%r10w", "%r10b"]),
"%r11" => Some(["%r11d", "%r11w", "%r11b"]),
"%r12" => Some(["%r12d", "%r12w", "%r12b"]),
"%r13" => Some(["%r13d", "%r13w", "%r13b"]),
"%r14" => Some(["%r14d", "%r14w", "%r14b"]),
"%r15" => Some(["%r15d", "%r15w", "%r15b"]),
_ => None,
};
if let Some(smaller_names) = remapper {
match size {
4 => s = smaller_names[0].into(),
2 => s = smaller_names[1].into(),
1 => s = smaller_names[2].into(),
_ => panic!("show_ireg_sized: real"),
}
}
} else {
// Add a "l", "w" or "b" suffix to RegClass::I64 vregs used at narrower widths.
let suffix = match size {
4 => "l",
2 => "w",
1 => "b",
_ => panic!("show_ireg_sized: virtual"),
};
s = s + suffix;
}
s
}

343
third_party/rust/cranelift-codegen/src/isa/x64/lower.rs vendored Normal file

@@ -0,0 +1,343 @@
//! Lowering rules for X64.
#![allow(dead_code)]
#![allow(non_snake_case)]
use regalloc::{Reg, Writable};
use crate::ir::condcodes::IntCC;
use crate::ir::types;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::result::CodegenResult;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::x64::X64Backend;
/// Context passed to all lowering functions.
type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
//=============================================================================
// Helpers for instruction lowering.
fn is_int_ty(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 | types::I64 => true,
_ => false,
}
}
fn int_ty_to_is64(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 => false,
types::I64 => true,
_ => panic!("type {} is none of I8, I16, I32 or I64", ty),
}
}
fn int_ty_to_sizeB(ty: Type) -> u8 {
match ty {
types::I8 => 1,
types::I16 => 2,
types::I32 => 4,
types::I64 => 8,
_ => panic!("ity_to_sizeB"),
}
}
fn iri_to_u64_immediate<'a>(ctx: Ctx<'a>, iri: IRInst) -> Option<u64> {
let inst_data = ctx.data(iri);
if inst_data.opcode() == Opcode::Null {
Some(0)
} else {
match inst_data {
&InstructionData::UnaryImm { opcode: _, imm } => {
// Only has Into for i64; we use u64 elsewhere, so we cast.
let imm: i64 = imm.into();
Some(imm as u64)
}
_ => None,
}
}
}
fn inst_condcode(data: &InstructionData) -> IntCC {
match data {
&InstructionData::IntCond { cond, .. }
| &InstructionData::BranchIcmp { cond, .. }
| &InstructionData::IntCompare { cond, .. }
| &InstructionData::IntCondTrap { cond, .. }
| &InstructionData::BranchInt { cond, .. }
| &InstructionData::IntSelect { cond, .. }
| &InstructionData::IntCompareImm { cond, .. } => cond,
_ => panic!("inst_condcode(x64): unhandled: {:?}", data),
}
}
fn intCC_to_x64_CC(cc: IntCC) -> CC {
match cc {
IntCC::Equal => CC::Z,
IntCC::NotEqual => CC::NZ,
IntCC::SignedGreaterThanOrEqual => CC::NL,
IntCC::SignedGreaterThan => CC::NLE,
IntCC::SignedLessThanOrEqual => CC::LE,
IntCC::SignedLessThan => CC::L,
IntCC::UnsignedGreaterThanOrEqual => CC::NB,
IntCC::UnsignedGreaterThan => CC::NBE,
IntCC::UnsignedLessThanOrEqual => CC::BE,
IntCC::UnsignedLessThan => CC::B,
IntCC::Overflow => CC::O,
IntCC::NotOverflow => CC::NO,
}
}
fn input_to_reg<'a>(ctx: Ctx<'a>, iri: IRInst, input: usize) -> Reg {
let inputs = ctx.get_input(iri, input);
ctx.use_input_reg(inputs);
inputs.reg
}
fn output_to_reg<'a>(ctx: Ctx<'a>, iri: IRInst, output: usize) -> Writable<Reg> {
ctx.get_output(iri, output)
}
//=============================================================================
// Top-level instruction lowering entry point, for one instruction.
/// Actually codegen an instruction's results into registers.
fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
let op = ctx.data(iri).opcode();
let ty = if ctx.num_outputs(iri) == 1 {
Some(ctx.output_ty(iri, 0))
} else {
None
};
// This is all outstandingly feeble. TODO: much better!
match op {
Opcode::Iconst => {
if let Some(w64) = iri_to_u64_immediate(ctx, iri) {
// Get exactly the bit pattern in 'w64' into the dest. No
// monkeying with sign extension etc.
let dstIs64 = w64 > 0xFFFF_FFFF;
let regD = output_to_reg(ctx, iri, 0);
ctx.emit(Inst::imm_r(dstIs64, w64, regD));
} else {
unimplemented!();
}
}
Opcode::Iadd | Opcode::Isub => {
let regD = output_to_reg(ctx, iri, 0);
let regL = input_to_reg(ctx, iri, 0);
let regR = input_to_reg(ctx, iri, 1);
let is64 = int_ty_to_is64(ty.unwrap());
let how = if op == Opcode::Iadd {
RMI_R_Op::Add
} else {
RMI_R_Op::Sub
};
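// x64 ALU instructions are two-address (the destination is also a source), so
// first copy the LHS into the destination and then fold in the RHS.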
ctx.emit(Inst::mov_r_r(true, regL, regD));
ctx.emit(Inst::alu_rmi_r(is64, how, RMI::reg(regR), regD));
}
Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
// TODO: fold an immediate shift amount directly into the instruction, rather than always going through %cl.
let tySL = ctx.input_ty(iri, 0);
let tyD = ctx.output_ty(iri, 0); // should be the same as tySL
let regSL = input_to_reg(ctx, iri, 0);
let regSR = input_to_reg(ctx, iri, 1);
let regD = output_to_reg(ctx, iri, 0);
if tyD == tySL && (tyD == types::I32 || tyD == types::I64) {
let how = match op {
Opcode::Ishl => ShiftKind::Left,
Opcode::Ushr => ShiftKind::RightZ,
Opcode::Sshr => ShiftKind::RightS,
_ => unreachable!(),
};
let is64 = tyD == types::I64;
let r_rcx = regs::rcx();
let w_rcx = Writable::<Reg>::from_reg(r_rcx);
ctx.emit(Inst::mov_r_r(true, regSL, regD));
ctx.emit(Inst::mov_r_r(true, regSR, w_rcx));
ctx.emit(Inst::shift_r(is64, how, None /*%cl*/, regD));
} else {
unimplemented!()
}
}
Opcode::Uextend | Opcode::Sextend => {
// TODO: this is all extremely lame, all because Mov{ZX,SX}_M_R
// don't accept a register source operand. They should be changed
// so as to have _RM_R form.
// TODO2: if the source operand is a load, incorporate that.
let isZX = op == Opcode::Uextend;
let tyS = ctx.input_ty(iri, 0);
let tyD = ctx.output_ty(iri, 0);
let regS = input_to_reg(ctx, iri, 0);
let regD = output_to_reg(ctx, iri, 0);
ctx.emit(Inst::mov_r_r(true, regS, regD));
match (tyS, tyD, isZX) {
(types::I8, types::I64, false) => {
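// Shift the byte up to the top of the register, then shift it back down
// arithmetically so its sign bit fills the upper 56 bits.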
ctx.emit(Inst::shift_r(true, ShiftKind::Left, Some(56), regD));
ctx.emit(Inst::shift_r(true, ShiftKind::RightS, Some(56), regD));
}
_ => unimplemented!(),
}
}
Opcode::FallthroughReturn | Opcode::Return => {
for i in 0..ctx.num_inputs(iri) {
let src_reg = input_to_reg(ctx, iri, i);
let retval_reg = ctx.retval(i);
ctx.emit(Inst::mov_r_r(true, src_reg, retval_reg));
}
// N.B.: the Ret itself is generated by the ABI.
}
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::UdivImm
| Opcode::SdivImm
| Opcode::UremImm
| Opcode::SremImm
| Opcode::IrsubImm
| Opcode::IaddCin
| Opcode::IaddIfcin
| Opcode::IaddCout
| Opcode::IaddIfcout
| Opcode::IaddCarry
| Opcode::IaddIfcarry
| Opcode::IsubBin
| Opcode::IsubIfbin
| Opcode::IsubBout
| Opcode::IsubIfbout
| Opcode::IsubBorrow
| Opcode::IsubIfborrow
| Opcode::BandImm
| Opcode::BorImm
| Opcode::BxorImm
| Opcode::RotlImm
| Opcode::RotrImm
| Opcode::IshlImm
| Opcode::UshrImm
| Opcode::SshrImm => {
panic!("ALU+imm and ALU+carry ops should not appear here!");
}
_ => unimplemented!("unimplemented lowering for opcode {:?}", op),
}
}
//=============================================================================
// Lowering-backend trait implementation.
impl LowerBackend for X64Backend {
type MInst = Inst;
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
lower_insn_to_regs(ctx, ir_inst);
Ok(())
}
fn lower_branch_group<C: LowerCtx<I = Inst>>(
&self,
ctx: &mut C,
branches: &[IRInst],
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
// it may be an unconditional branch, a fallthrough, a return, or a
// trap. These conditions are verified by `is_ebb_basic()` during the
// verifier pass.
assert!(branches.len() <= 2);
let mut unimplemented = false;
if branches.len() == 2 {
// Must be a conditional branch followed by an unconditional branch.
let op0 = ctx.data(branches[0]).opcode();
let op1 = ctx.data(branches[1]).opcode();
println!(
"QQQQ lowering two-branch group: opcodes are {:?} and {:?}",
op0, op1
);
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
let taken = BranchTarget::Label(targets[0]);
let not_taken = match op1 {
Opcode::Jump => BranchTarget::Label(targets[1]),
Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
_ => unreachable!(), // assert above.
};
match op0 {
Opcode::Brz | Opcode::Brnz => {
let tyS = ctx.input_ty(branches[0], 0);
if is_int_ty(tyS) {
let rS = input_to_reg(ctx, branches[0], 0);
let cc = match op0 {
Opcode::Brz => CC::Z,
Opcode::Brnz => CC::NZ,
_ => unreachable!(),
};
let sizeB = int_ty_to_sizeB(tyS);
ctx.emit(Inst::cmp_rmi_r(sizeB, RMI::imm(0), rS));
ctx.emit(Inst::jmp_cond_symm(cc, taken, not_taken));
} else {
unimplemented = true;
}
}
Opcode::BrIcmp => {
let tyS = ctx.input_ty(branches[0], 0);
if is_int_ty(tyS) {
let rSL = input_to_reg(ctx, branches[0], 0);
let rSR = input_to_reg(ctx, branches[0], 1);
let cc = intCC_to_x64_CC(inst_condcode(ctx.data(branches[0])));
let sizeB = int_ty_to_sizeB(tyS);
// FIXME verify rSR vs rSL ordering
ctx.emit(Inst::cmp_rmi_r(sizeB, RMI::reg(rSR), rSL));
ctx.emit(Inst::jmp_cond_symm(cc, taken, not_taken));
} else {
unimplemented = true;
}
}
// TODO: Brif/icmp, Brff/icmp, jump tables
_ => {
unimplemented = true;
}
}
} else {
assert!(branches.len() == 1);
// Must be an unconditional branch or trap.
let op = ctx.data(branches[0]).opcode();
match op {
Opcode::Jump => {
ctx.emit(Inst::jmp_known(BranchTarget::Label(targets[0])));
}
Opcode::Fallthrough => {
ctx.emit(Inst::jmp_known(BranchTarget::Label(targets[0])));
}
Opcode::Trap => {
unimplemented = true;
}
_ => panic!("Unknown branch type!"),
}
}
if unimplemented {
unimplemented!("lower_branch_group(x64): can't handle: {:?}", branches);
}
Ok(())
}
}

112
third_party/rust/cranelift-codegen/src/isa/x64/mod.rs vendored Normal file

@@ -0,0 +1,112 @@
//! x86_64 Instruction Set Architecture.
use alloc::boxed::Box;
use regalloc::RealRegUniverse;
use target_lexicon::Triple;
use crate::ir::condcodes::IntCC;
use crate::ir::Function;
use crate::isa::Builder as IsaBuilder;
use crate::machinst::pretty_print::ShowWithRRU;
use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
use crate::result::CodegenResult;
use crate::settings::{self, Flags};
use crate::isa::x64::inst::regs::create_reg_universe_systemv;
mod abi;
mod inst;
mod lower;
/// An X64 backend.
pub(crate) struct X64Backend {
triple: Triple,
flags: Flags,
reg_universe: RealRegUniverse,
}
impl X64Backend {
/// Create a new X64 backend with the given (shared) flags.
fn new_with_flags(triple: Triple, flags: Flags) -> Self {
let reg_universe = create_reg_universe_systemv(&flags);
Self {
triple,
flags,
reg_universe,
}
}
fn compile_vcode(&self, func: &Function, flags: Flags) -> CodegenResult<VCode<inst::Inst>> {
// This performs lowering to VCode, register-allocates the code, computes
// block layout and finalizes branches. The result is ready for binary emission.
let abi = Box::new(abi::X64ABIBody::new(&func, flags));
compile::compile::<Self>(&func, self, abi)
}
}
impl MachBackend for X64Backend {
fn compile_function(
&self,
func: &Function,
want_disasm: bool,
) -> CodegenResult<MachCompileResult> {
let flags = self.flags();
let vcode = self.compile_vcode(func, flags.clone())?;
let buffer = vcode.emit();
let buffer = buffer.finish();
let frame_size = vcode.frame_size();
let disasm = if want_disasm {
Some(vcode.show_rru(Some(&create_reg_universe_systemv(flags))))
} else {
None
};
Ok(MachCompileResult {
buffer,
frame_size,
disasm,
})
}
fn flags(&self) -> &Flags {
&self.flags
}
fn name(&self) -> &'static str {
"x64"
}
fn triple(&self) -> Triple {
self.triple.clone()
}
fn reg_universe(&self) -> &RealRegUniverse {
&self.reg_universe
}
fn unsigned_add_overflow_condition(&self) -> IntCC {
// Unsigned `>=`; this corresponds to the carry flag set on x86, which happens on
// overflow of an add.
IntCC::UnsignedGreaterThanOrEqual
}
fn unsigned_sub_overflow_condition(&self) -> IntCC {
// unsigned `>=`; this corresponds to the carry flag set on x86, which happens on
// underflow of a subtract (carry is borrow for subtract).
IntCC::UnsignedGreaterThanOrEqual
}
}
/// Create a new `isa::Builder`.
pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder {
IsaBuilder {
triple,
setup: settings::builder(),
constructor: |triple: Triple, flags: Flags, _arch_flag_builder: settings::Builder| {
let backend = X64Backend::new_with_flags(triple, flags);
Box::new(TargetIsaAdapter::new(backend))
},
}
}


@@ -6,7 +6,6 @@ use super::settings as isa_settings;
use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
use crate::cursor::{Cursor, CursorPosition, EncCursor};
use crate::ir;
use crate::ir::entities::StackSlot;
use crate::ir::immediates::Imm64;
use crate::ir::stackslot::{StackOffset, StackSize};
use crate::ir::types;
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
use crate::result::CodegenResult;
use crate::stack_layout::layout_stack;
use alloc::borrow::Cow;
use alloc::vec::Vec;
use core::i32;
use target_lexicon::{PointerWidth, Triple};
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
///
/// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
/// convention reserves spill space for parameters, you dont have to use them as such"
const WIN_SHADOW_STACK_SPACE: i32 = 32;
const WIN_SHADOW_STACK_SPACE: StackSize = 32;
/// Stack alignment requirement for functions.
///
@@ -72,6 +70,7 @@ struct Args {
shared_flags: shared_settings::Flags,
#[allow(dead_code)]
isa_flags: isa_settings::Flags,
assigning_returns: bool,
}
impl Args {
@@ -82,12 +81,13 @@ impl Args {
call_conv: CallConv,
shared_flags: &shared_settings::Flags,
isa_flags: &isa_settings::Flags,
assigning_returns: bool,
) -> Self {
let offset = if call_conv.extends_windows_fastcall() {
WIN_SHADOW_STACK_SPACE
} else {
0
} as u32;
};
Self {
pointer_bytes: bits / 8,
@ -101,6 +101,7 @@ impl Args {
call_conv,
shared_flags: shared_flags.clone(),
isa_flags: isa_flags.clone(),
assigning_returns,
}
}
}
@ -109,6 +110,17 @@ impl ArgAssigner for Args {
fn assign(&mut self, arg: &AbiParam) -> ArgAction {
let ty = arg.value_type;
if ty.bits() > u16::from(self.pointer_bits) {
if !self.assigning_returns && self.call_conv.extends_windows_fastcall() {
// "Any argument that doesn't fit in 8 bytes, or isn't
// 1, 2, 4, or 8 bytes, must be passed by reference"
return ValueConversion::Pointer(self.pointer_type).into();
} else if !ty.is_vector() && !ty.is_float() {
// On SystemV large integers and booleans are broken down to fit in a register.
return ValueConversion::IntSplit.into();
}
}
// Vectors should stay in vector registers unless SIMD is not enabled--then they are split
if ty.is_vector() {
if self.shared_flags.enable_simd() {
@ -119,11 +131,6 @@ impl ArgAssigner for Args {
return ValueConversion::VectorSplit.into();
}
// Large integers and booleans are broken down to fit in a register.
if !ty.is_float() && ty.bits() > u16::from(self.pointer_bits) {
return ValueConversion::IntSplit.into();
}
// Small integers are extended to the size of a pointer register.
if ty.is_int() && ty.bits() < u16::from(self.pointer_bits) {
match arg.extension {
@ -205,7 +212,7 @@ pub fn legalize_signature(
PointerWidth::U16 => panic!(),
PointerWidth::U32 => {
bits = 32;
args = Args::new(bits, &[], 0, sig.call_conv, shared_flags, isa_flags);
args = Args::new(bits, &[], 0, sig.call_conv, shared_flags, isa_flags, false);
}
PointerWidth::U64 => {
bits = 64;
@ -217,6 +224,7 @@ pub fn legalize_signature(
sig.call_conv,
shared_flags,
isa_flags,
false,
)
} else {
Args::new(
@ -226,6 +234,7 @@ pub fn legalize_signature(
sig.call_conv,
shared_flags,
isa_flags,
false,
)
};
}
@ -245,26 +254,20 @@ pub fn legalize_signature(
sig.call_conv,
shared_flags,
isa_flags,
true,
);
let sig_is_multi_return = sig.is_multi_return();
// If this is a multi-value return and we don't have enough available return
// registers to fit all of the return values, we need to backtrack and start
// If we don't have enough available return registers
// to fit all of the return values, we need to backtrack and start
// assigning locations all over again with a different strategy. In order to
// do that, we need a copy of the original assigner for the returns.
let backup_rets_for_struct_return = if sig_is_multi_return {
Some(rets.clone())
} else {
None
};
let mut backup_rets = rets.clone();
if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) {
if sig.is_multi_return()
&& new_returns
.iter()
.filter(|r| r.purpose == ArgumentPurpose::Normal)
.any(|r| !r.location.is_reg())
if new_returns
.iter()
.filter(|r| r.purpose == ArgumentPurpose::Normal)
.any(|r| !r.location.is_reg())
{
// The return values couldn't all fit into available return
// registers. Introduce the use of a struct-return parameter.
@ -276,6 +279,7 @@ pub fn legalize_signature(
purpose: ArgumentPurpose::StructReturn,
extension: ArgumentExtension::None,
location: ArgumentLoc::Unassigned,
legalized_to_pointer: false,
};
match args.assign(&ret_ptr_param) {
ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
@ -285,8 +289,6 @@ pub fn legalize_signature(
_ => unreachable!("return pointer should always get a register assignment"),
}
let mut backup_rets = backup_rets_for_struct_return.unwrap();
// We're using the first return register for the return pointer (like
// sys v does).
let mut ret_ptr_return = AbiParam {
@ -294,6 +296,7 @@ pub fn legalize_signature(
purpose: ArgumentPurpose::StructReturn,
extension: ArgumentExtension::None,
location: ArgumentLoc::Unassigned,
legalized_to_pointer: false,
};
match backup_rets.assign(&ret_ptr_return) {
ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
@ -501,7 +504,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
let word_size = StackSize::from(isa.pointer_bytes());
let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
WIN_SHADOW_STACK_SPACE as u32
WIN_SHADOW_STACK_SPACE
} else {
0
};
@ -525,50 +528,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
panic!("TODO: windows-fastcall: x86-32 not implemented yet");
}
let csrs = callee_saved_regs_used(isa, func);
// The reserved stack area is composed of:
// return address + frame pointer + all callee-saved registers + shadow space
// return address + frame pointer + all callee-saved registers
//
// Pushing the return address is an implicit function of the `call`
// instruction. Each of the others we will then push explicitly. Then we
// will adjust the stack pointer to make room for the rest of the required
// space for this frame.
let word_size = isa.pointer_bytes() as usize;
let num_fprs = csrs.iter(FPR).len();
let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
let csrs = callee_saved_regs_used(isa, func);
let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
// Only create an FPR stack slot if we're going to save FPRs.
let fpr_slot = if num_fprs > 0 {
// Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
// seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
// through `stack_load` and `stack_store` (see later comment about issue #1198). Even
// though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
// implies that `SpillSlot` may be eligible for certain optimizations, and we know with
// certainty that this space may not be reused in the function, nor moved around.
Some(func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::ExplicitSlot,
size: (num_fprs * types::F64X2.bytes() as usize) as u32,
offset: None,
}))
} else {
None
};
// FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
if fpsr_stack_size > 0 {
csr_stack_size = (csr_stack_size + 15) & !15;
}
// TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
// since cranelift does not support spill slots before incoming args
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::IncomingArg,
size: csr_stack_size as u32,
offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
size: csr_stack_size,
offset: Some(-(csr_stack_size as StackOffset)),
});
let is_leaf = func.is_leaf();
// If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
if !is_leaf {
// TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::ExplicitSlot,
size: WIN_SHADOW_STACK_SPACE,
offset: None,
});
}
let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
let local_stack_size = i64::from(total_stack_size - csr_stack_size);
// Subtract the GPR saved register size from the local size because pushes are used for the saves
let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
// Add CSRs to function signature
let reg_type = isa.pointer_type();
let sp_arg_index = if fpsr_stack_size > 0 {
let sp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::CalleeSaved,
RU::rsp as RegUnit,
);
let index = func.signature.params.len();
func.signature.params.push(sp_arg);
Some(index)
} else {
None
};
let fp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::FramePointer,
@ -601,19 +614,13 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
local_stack_size,
reg_type,
&csrs,
fpr_slot.as_ref(),
sp_arg_index.is_some(),
isa,
);
// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
insert_common_epilogues(
&mut pos,
local_stack_size,
reg_type,
&csrs,
fpr_slot.as_ref(),
);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
Ok(())
}
@ -649,6 +656,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
// Add CSRs to function signature
let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
// On X86-32 all parameters, including vmctx, are passed on stack, and we need
// to extract vmctx from the stack before we can save the frame pointer.
let sp_arg_index = if isa.pointer_bits() == 32 {
let sp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::CalleeSaved,
RU::rsp as RegUnit,
);
let index = func.signature.params.len();
func.signature.params.push(sp_arg);
Some(index)
} else {
None
};
let fp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::FramePointer,
@ -666,11 +687,18 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
// Set up the cursor and insert the prologue
let entry_block = func.layout.entry_block().expect("missing entry block");
let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
insert_common_prologue(
&mut pos,
local_stack_size,
reg_type,
&csrs,
sp_arg_index.is_some(),
isa,
);
// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
Ok(())
}
@ -682,9 +710,18 @@ fn insert_common_prologue(
stack_size: i64,
reg_type: ir::types::Type,
csrs: &RegisterSet,
fpr_slot: Option<&StackSlot>,
has_sp_param: bool,
isa: &dyn TargetIsa,
) {
let sp = if has_sp_param {
let block = pos.current_block().expect("missing block under cursor");
let sp = pos.func.dfg.append_block_param(block, reg_type);
pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
Some(sp)
} else {
None
};
// If this is a leaf function with zero stack, then there's no need to
// insert a stack check since it can't overflow anything and
// forward-progress is guaranteed so long as loops are handled anyway.
@ -707,7 +744,7 @@ fn insert_common_prologue(
None => pos
.func
.stack_limit
.map(|gv| interpret_gv(pos, gv, scratch)),
.map(|gv| interpret_gv(pos, gv, sp, scratch)),
};
if let Some(stack_limit_arg) = stack_limit_arg {
insert_stack_check(pos, stack_size, stack_limit_arg);
@ -780,38 +817,27 @@ fn insert_common_prologue(
}
}
// Now that RSP is prepared for the function, we can use stack slots:
// With the stack pointer adjusted, save any callee-saved floating point registers via offset
// FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
let mut last_fpr_save = None;
if let Some(fpr_slot) = fpr_slot {
debug_assert!(csrs.iter(FPR).len() != 0);
// `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
// address. We are well after postopt could run, so load the CSR region base once here,
// instead of hoping that the addr/store will be combined later.
// See also: https://github.com/bytecodealliance/wasmtime/pull/1198
let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
for (i, reg) in csrs.iter(FPR).enumerate() {
// Append param to entry block
let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
// Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
// function entry.
pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
// Since regalloc has already run, we must assign a location.
pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
let mut fpr_offset = 0;
// Offset to where the register is saved relative to RSP, accounting for FPR save alignment
let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+ (stack_size % types::F64X2.bytes() as i64);
for reg in csrs.iter(FPR) {
// Append param to entry Block
let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
// Since regalloc has already run, we must assign a location.
pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
last_fpr_save =
Some(
pos.ins()
.store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
);
fpr_offset += types::F64X2.bytes() as i32;
}
last_fpr_save = Some(pos.ins().store(
ir::MemFlags::trusted(),
csr_arg,
sp.expect("FPR save requires SP param"),
(stack_size - offset) as i32,
));
}
pos.func.prologue_end = Some(
@ -834,19 +860,55 @@ fn insert_common_prologue(
/// compared to the stack pointer, but currently it serves enough functionality
/// to get this implemented in `wasmtime` itself. This'll likely get expanded a
/// bit over time!
fn interpret_gv(pos: &mut EncCursor, gv: ir::GlobalValue, scratch: ir::ValueLoc) -> ir::Value {
fn interpret_gv(
pos: &mut EncCursor,
gv: ir::GlobalValue,
sp: Option<ir::Value>,
scratch: ir::ValueLoc,
) -> ir::Value {
match pos.func.global_values[gv] {
ir::GlobalValueData::VMContext => pos
.func
.special_param(ir::ArgumentPurpose::VMContext)
.expect("no vmcontext parameter found"),
ir::GlobalValueData::VMContext => {
let vmctx_index = pos
.func
.signature
.special_param_index(ir::ArgumentPurpose::VMContext)
.expect("no vmcontext parameter found");
match pos.func.signature.params[vmctx_index] {
AbiParam {
location: ArgumentLoc::Reg(_),
..
} => {
let entry = pos.func.layout.entry_block().unwrap();
pos.func.dfg.block_params(entry)[vmctx_index]
}
AbiParam {
location: ArgumentLoc::Stack(offset),
value_type,
..
} => {
let offset =
offset + i32::from(pos.isa.pointer_bytes() * (1 + vmctx_index as u8));
// The following access can be marked `trusted` because it is a load of an argument. We
// know it is safe because it was safe to write it in preparing this function call.
let ret =
pos.ins()
.load(value_type, ir::MemFlags::trusted(), sp.unwrap(), offset);
pos.func.locations[ret] = scratch;
return ret;
}
AbiParam {
location: ArgumentLoc::Unassigned,
..
} => unreachable!(),
}
}
ir::GlobalValueData::Load {
base,
offset,
global_type,
readonly: _,
} => {
let base = interpret_gv(pos, base, scratch);
let base = interpret_gv(pos, base, sp, scratch);
let ret = pos
.ins()
.load(global_type, ir::MemFlags::trusted(), base, offset);
@ -911,13 +973,13 @@ fn insert_common_epilogues(
stack_size: i64,
reg_type: ir::types::Type,
csrs: &RegisterSet,
fpr_slot: Option<&StackSlot>,
sp_arg_index: Option<usize>,
) {
while let Some(block) = pos.next_block() {
pos.goto_last_inst(block);
if let Some(inst) = pos.current_inst() {
if pos.func.dfg[inst].opcode().is_return() {
insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index);
}
}
}
@ -931,56 +993,8 @@ fn insert_common_epilogue(
pos: &mut EncCursor,
reg_type: ir::types::Type,
csrs: &RegisterSet,
fpr_slot: Option<&StackSlot>,
sp_arg_index: Option<usize>,
) {
// Even though instructions to restore FPRs are inserted first, we have to append them after
// restored GPRs to satisfy parameter order in the return.
let mut restored_fpr_values = Vec::new();
// Restore FPRs before we move RSP and invalidate stack slots.
let mut first_fpr_load = None;
if let Some(fpr_slot) = fpr_slot {
debug_assert!(csrs.iter(FPR).len() != 0);
// `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
// address. We are well after postopt could run, so load the CSR region base once here,
// instead of hoping that the addr/store will be combined later.
//
// See also: https://github.com/bytecodealliance/wasmtime/pull/1198
let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
// Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
// function exit.
pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
let mut fpr_offset = 0;
for reg in csrs.iter(FPR) {
let value = pos.ins().load(
types::F64X2,
ir::MemFlags::trusted(),
stack_addr,
fpr_offset,
);
fpr_offset += types::F64X2.bytes() as i32;
// Unlike GPRs before, we don't need to step back after each restoration because FPR
// restoration is order-insensitive. Furthermore: we want GPR restoration to begin
// after FPR restoration, so that stack adjustments occur after we're done relying on
// StackSlot validity.
pos.func.locations[value] = ir::ValueLoc::Reg(reg);
restored_fpr_values.push(value);
}
}
let mut sp_adjust_inst = None;
if stack_size > 0 {
sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
}
// Insert the pop of the frame pointer
let fp_pop = pos.ins().x86_pop(reg_type);
let fp_pop_inst = pos.prev_inst().unwrap();
@ -991,13 +1005,47 @@ fn insert_common_epilogue(
let mut first_csr_pop_inst = None;
for reg in csrs.iter(GPR) {
let csr_pop = pos.ins().x86_pop(reg_type);
first_csr_pop_inst = Some(pos.prev_inst().unwrap());
first_csr_pop_inst = pos.prev_inst();
assert!(first_csr_pop_inst.is_some());
pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
pos.func.dfg.append_inst_arg(inst, csr_pop);
}
for value in restored_fpr_values.into_iter() {
pos.func.dfg.append_inst_arg(inst, value);
// Insert the adjustment of SP
let mut sp_adjust_inst = None;
if stack_size > 0 {
pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
sp_adjust_inst = pos.prev_inst();
assert!(sp_adjust_inst.is_some());
}
let mut first_fpr_load = None;
if let Some(index) = sp_arg_index {
let sp = pos
.func
.dfg
.block_params(pos.func.layout.entry_block().unwrap())[index];
// Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
for (i, reg) in csrs.iter(FPR).enumerate() {
// Offset to where the register is saved relative to RSP, accounting for FPR save alignment
let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+ (stack_size % types::F64X2.bytes() as i64);
let value = pos.ins().load(
types::F64X2,
ir::MemFlags::trusted(),
sp,
(stack_size - offset) as i32,
);
first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
pos.func.locations[value] = ir::ValueLoc::Reg(reg);
pos.func.dfg.append_inst_arg(inst, value);
}
} else {
assert!(csrs.iter(FPR).len() == 0);
}
pos.func.epilogues_start.push(


@ -13,6 +13,7 @@ use crate::isa::encoding::base_size;
use crate::isa::encoding::{Encoding, RecipeSizing};
use crate::isa::RegUnit;
use crate::isa::{self, TargetIsa};
use crate::legalizer::expand_as_libcall;
use crate::predicates;
use crate::regalloc::RegDiversions;
@ -246,6 +247,20 @@ fn size_with_inferred_rex_for_inreg0_inreg1(
sizing.base_size + if needs_rex { 1 } else { 0 }
}
/// Infers whether a dynamic REX prefix will be emitted, based on second and third operand.
fn size_with_inferred_rex_for_inreg1_inreg2(
sizing: &RecipeSizing,
_enc: Encoding,
inst: Inst,
divert: &RegDiversions,
func: &Function,
) -> u8 {
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
|| test_input(2, inst, divert, func, is_extended_reg);
sizing.base_size + if needs_rex { 1 } else { 0 }
}
/// Infers whether a dynamic REX prefix will be emitted, based on a single
/// input register and a single output register.
fn size_with_inferred_rex_for_inreg0_outreg0(
@ -1181,10 +1196,10 @@ fn convert_extractlane(
let mut pos = FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);
if let ir::InstructionData::ExtractLane {
if let ir::InstructionData::BinaryImm8 {
opcode: ir::Opcode::Extractlane,
arg,
lane,
imm: lane,
} = pos.func.dfg[inst]
{
// NOTE: the following legalization assumes that the upper bits of the XMM register do
@ -1237,10 +1252,10 @@ fn convert_insertlane(
let mut pos = FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);
if let ir::InstructionData::InsertLane {
if let ir::InstructionData::TernaryImm8 {
opcode: ir::Opcode::Insertlane,
args: [vector, replacement],
lane,
imm: lane,
} = pos.func.dfg[inst]
{
let value_type = pos.func.dfg.value_type(vector);
@ -1255,7 +1270,7 @@ fn convert_insertlane(
pos.func
.dfg
.replace(inst)
.x86_insertps(vector, immediate, replacement)
.x86_insertps(vector, replacement, immediate)
}
F64X2 => {
let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types
@ -1283,7 +1298,7 @@ fn convert_insertlane(
pos.func
.dfg
.replace(inst)
.x86_pinsr(vector, lane, replacement);
.x86_pinsr(vector, replacement, lane);
}
}
}
@ -1318,6 +1333,39 @@ fn convert_ineg(
}
}
fn expand_dword_to_xmm<'f>(
pos: &mut FuncCursor<'_>,
arg: ir::Value,
arg_type: ir::Type,
) -> ir::Value {
if arg_type == I64 {
let (arg_lo, arg_hi) = pos.ins().isplit(arg);
let arg = pos.ins().scalar_to_vector(I32X4, arg_lo);
let arg = pos.ins().insertlane(arg, arg_hi, 1);
let arg = pos.ins().raw_bitcast(I64X2, arg);
arg
} else {
pos.ins().bitcast(I64X2, arg)
}
}
fn contract_dword_from_xmm<'f>(
pos: &mut FuncCursor<'f>,
inst: ir::Inst,
ret: ir::Value,
ret_type: ir::Type,
) {
if ret_type == I64 {
let ret = pos.ins().raw_bitcast(I32X4, ret);
let ret_lo = pos.ins().extractlane(ret, 0);
let ret_hi = pos.ins().extractlane(ret, 1);
pos.func.dfg.replace(inst).iconcat(ret_lo, ret_hi);
} else {
let ret = pos.ins().extractlane(ret, 0);
pos.func.dfg.replace(inst).ireduce(ret_type, ret);
}
}
// Masks for i8x16 unsigned right shift.
static USHR_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@ -1379,7 +1427,24 @@ fn convert_ushr(
} else if arg0_type.is_vector() {
// x86 has encodings for these shifts.
pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
} else if arg0_type == I64 {
// 64 bit shifts need to be legalized on x86_32.
let x86_isa = isa
.as_any()
.downcast_ref::<isa::x86::Isa>()
.expect("the target ISA must be x86 at this point");
if x86_isa.isa_flags.has_sse41() {
// if we have pinstrq/pextrq (SSE 4.1), legalize to that
let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
let shifted = pos.ins().x86_psrl(value, amount);
contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
} else {
// otherwise legalize to libcall
expand_as_libcall(inst, func, isa);
}
} else {
// Everything else should be already legal.
unreachable!()
}
}
@ -1446,12 +1511,76 @@ fn convert_ishl(
} else if arg0_type.is_vector() {
// x86 has encodings for these shifts.
pos.func.dfg.replace(inst).x86_psll(arg0, shift_index);
} else if arg0_type == I64 {
// 64 bit shifts need to be legalized on x86_32.
let x86_isa = isa
.as_any()
.downcast_ref::<isa::x86::Isa>()
.expect("the target ISA must be x86 at this point");
if x86_isa.isa_flags.has_sse41() {
// if we have pinstrq/pextrq (SSE 4.1), legalize to that
let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
let shifted = pos.ins().x86_psll(value, amount);
contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
} else {
// otherwise legalize to libcall
expand_as_libcall(inst, func, isa);
}
} else {
// Everything else should be already legal.
unreachable!()
}
}
}
/// Convert an imul.i64x2 to a valid code sequence on x86, first with AVX512 and then with SSE2.
fn convert_i64x2_imul(
inst: ir::Inst,
func: &mut ir::Function,
_cfg: &mut ControlFlowGraph,
isa: &dyn TargetIsa,
) {
let mut pos = FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);
if let ir::InstructionData::Binary {
opcode: ir::Opcode::Imul,
args: [arg0, arg1],
} = pos.func.dfg[inst]
{
let ty = pos.func.dfg.ctrl_typevar(inst);
if ty == I64X2 {
let x86_isa = isa
.as_any()
.downcast_ref::<isa::x86::Isa>()
.expect("the target ISA must be x86 at this point");
if x86_isa.isa_flags.use_avx512dq_simd() || x86_isa.isa_flags.use_avx512vl_simd() {
// If we have certain AVX512 features, we can lower this instruction simply.
pos.func.dfg.replace(inst).x86_pmullq(arg0, arg1);
} else {
// Otherwise, we default to a very lengthy SSE2-compatible sequence. It splits each
// 64-bit lane into 32-bit high and low sections using shifting and then performs
// the following arithmetic per lane: with arg0 = concat(high0, low0) and arg1 =
// concat(high1, low1), calculate (high0 * low1) + (high1 * low0) + (low0 * low1).
let high0 = pos.ins().ushr_imm(arg0, 32);
let mul0 = pos.ins().x86_pmuludq(high0, arg1);
let high1 = pos.ins().ushr_imm(arg1, 32);
let mul1 = pos.ins().x86_pmuludq(high1, arg0);
let addhigh = pos.ins().iadd(mul0, mul1);
let high = pos.ins().ishl_imm(addhigh, 32);
let low = pos.ins().x86_pmuludq(arg0, arg1);
pos.func.dfg.replace(inst).iadd(low, high);
}
} else {
unreachable!(
"{} should be encodable; it cannot be legalized by convert_i64x2_imul",
pos.func.dfg.display_inst(inst, None)
);
}
}
}
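For reference, the SSE2 fallback above is 64-bit schoolbook multiplication truncated to the low 64 bits of each lane: writing a lane pair as a = 2^32·a_hi + a_lo and b = 2^32·b_hi + b_lo,

    a·b mod 2^64 = ((a_hi·b_lo + b_hi·a_lo) << 32) + a_lo·b_lo,

since the a_hi·b_hi contribution is a multiple of 2^64 and drops out. The three `x86_pmuludq` results are exactly the 32×32→64 partial products a_hi·b_lo, b_hi·a_lo, and a_lo·b_lo consumed by the emitted sequence.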
fn expand_tls_value(
inst: ir::Inst,
func: &mut ir::Function,


@ -23,6 +23,7 @@ use crate::result::CodegenResult;
use crate::timing;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::any::Any;
use core::fmt;
use target_lexicon::{PointerWidth, Triple};
@ -53,12 +54,23 @@ fn isa_constructor(
PointerWidth::U32 => &enc_tables::LEVEL1_I32[..],
PointerWidth::U64 => &enc_tables::LEVEL1_I64[..],
};
Box::new(Isa {
triple,
isa_flags: settings::Flags::new(&shared_flags, builder),
shared_flags,
cpumode: level1,
})
let isa_flags = settings::Flags::new(&shared_flags, builder);
if isa_flags.use_new_backend() {
#[cfg(not(feature = "x64"))]
panic!("new backend x86 support not included by cargo features!");
#[cfg(feature = "x64")]
super::x64::isa_builder(triple).finish(shared_flags)
} else {
Box::new(Isa {
triple,
isa_flags,
shared_flags,
cpumode: level1,
})
}
}
impl TargetIsa for Isa {
@ -173,6 +185,10 @@ impl TargetIsa for Isa {
fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
Some(unwind::systemv::create_cie())
}
fn as_any(&self) -> &dyn Any {
self as &dyn Any
}
}
impl fmt::Display for Isa {


@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
let mut prologue_size = 0;
let mut unwind_codes = Vec::new();
let mut found_end = false;
// Have we saved at least one FPR? if so, we might have to check additional constraints.
let mut saved_fpr = false;
// In addition to the min offset for a callee-save, we need to know the offset from the
// frame base to the stack pointer, so that we can record an unwind offset that spans only
// to the end of callee-save space.
let mut static_frame_allocation_size = 0u32;
// For the time being, FPR preservation is split into a stack_addr and later store/load.
// Store the register used for stack store and ensure it is the same register with no
// intervening changes to the frame size.
let mut callee_save_region_reg = None;
// Also record the callee-save region's offset from RSP, because it must be added to FPR
// save offsets to compute an offset from the frame base.
let mut callee_save_offset = None;
let mut xmm_save_count: u8 = 0;
for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
// x64 ABI prologues cannot exceed 255 bytes in length
@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
InstructionData::Unary { opcode, arg } => {
match opcode {
Opcode::X86Push => {
static_frame_allocation_size += 8;
unwind_codes.push(UnwindCode::PushRegister {
offset: unwind_offset,
reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
Opcode::AdjustSpDown => {
let stack_size =
stack_size.expect("expected a previous stack size instruction");
static_frame_allocation_size += stack_size;
// This is used when calling a stack check function
// We need to track the assignment to RAX which has the size of the stack
@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
InstructionData::CopySpecial { src, dst, .. } => {
if let Some(frame_register) = frame_register {
if src == (RU::rsp as RegUnit) && dst == frame_register {
// Constructing an rbp-based stack frame, so the static frame
// allocation restarts at 0 from here.
static_frame_allocation_size = 0;
unwind_codes.push(UnwindCode::SetFramePointer {
offset: unwind_offset,
sp_offset: 0,
@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
let imm: i64 = imm.into();
assert!(imm <= core::u32::MAX as i64);
static_frame_allocation_size += imm as u32;
stack_size = Some(imm as u32);
unwind_codes.push(UnwindCode::StackAlloc {
offset: unwind_offset,
@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
_ => {}
}
}
InstructionData::StackLoad {
opcode: Opcode::StackAddr,
stack_slot,
offset: _,
} => {
let result = func.dfg.inst_results(inst).get(0).unwrap();
if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
callee_save_region_reg = Some(frame_reg);
// Figure out the offset in the call frame that `frame_reg` will have.
let frame_size = func
.stack_slots
.layout_info
.expect("func's stack slots have layout info if stack operations exist")
.frame_size;
// Because we're well after the prologue has been constructed, stack slots
// must have been laid out...
let slot_offset = func.stack_slots[stack_slot]
.offset
.expect("callee-save slot has an offset computed");
let frame_offset = frame_size as i32 + slot_offset;
callee_save_offset = Some(frame_offset as u32);
}
}
InstructionData::Store {
opcode: Opcode::Store,
args: [arg1, arg2],
flags: _flags,
offset,
..
} => {
if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
(func.locations[arg1], func.locations[arg2])
{
if Some(base_ru) == callee_save_region_reg {
let offset_int: i32 = offset.into();
assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR preservation requires an FPR save region, which has some stack offset");
if FPR.contains(ru) {
saved_fpr = true;
unwind_codes.push(UnwindCode::SaveXmm {
offset: unwind_offset,
reg: ru as u8,
stack_offset: offset_int,
});
}
// If this is a save of an FPR, record an unwind operation
// Note: the stack_offset here is relative to an adjusted SP
// This will be fixed up later to be based on the frame pointer offset
if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
let offset: i32 = offset.into();
unwind_codes.push(UnwindCode::SaveXmm {
offset: unwind_offset,
reg: src as u8,
stack_offset: offset as u32,
});
xmm_save_count += 1;
}
}
}
@ -183,41 +136,45 @@ pub(crate) fn create_unwind_info(
assert!(found_end);
if saved_fpr {
if static_frame_allocation_size > 240 && saved_fpr {
warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
This is a Cranelift implementation limit, see \
https://github.com/bytecodealliance/wasmtime/issues/1475",
static_frame_allocation_size);
return Err(CodegenError::ImplLimitExceeded);
// When using a frame register, certain unwind operations, such as XMM saves, are relative to the frame
// register minus some offset, forming a "base address". This attempts to calculate the frame register offset
// while updating the XMM save offsets to be relative from this "base address" rather than RSP.
let mut frame_register_offset = 0;
if frame_register.is_some() && xmm_save_count > 0 {
// Determine the number of 16-byte slots used for all CSRs (including GPRs)
// The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
// Assumption: each FPR is stored at a lower address than the previous one
let mut last_stack_offset = None;
let mut fpr_save_count: u8 = 0;
let mut gpr_push_count: u8 = 0;
for code in unwind_codes.iter_mut() {
match code {
UnwindCode::SaveXmm { stack_offset, .. } => {
if let Some(last) = last_stack_offset {
assert!(last > *stack_offset);
}
last_stack_offset = Some(*stack_offset);
fpr_save_count += 1;
*stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
}
UnwindCode::PushRegister { .. } => {
gpr_push_count += 1;
}
_ => {}
}
}
// Only test static frame size is 16-byte aligned when an FPR is saved to avoid
// panicking when alignment is elided because no FPRs are saved and no child calls are
// made.
assert!(
static_frame_allocation_size % 16 == 0,
"static frame allocation must be a multiple of 16"
);
}
assert_eq!(fpr_save_count, xmm_save_count);
// Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
// one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
// Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
// frames are inexpressible, and we cannot actually compile the function. In case there are
// no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
// will actually check it. This, then, avoids panics when compiling functions with large
// call frames.
let reported_frame_offset = if saved_fpr {
(static_frame_allocation_size / 16) as u8
} else {
0
};
// Account for alignment space when there's an odd number of GPR pushes
// Assumption: an FPR (16 bytes) is twice the size of a GPR (8 bytes), hence the (rounded-up) integer division
frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
}
Ok(Some(UnwindInfo {
flags: 0, // this assumes cranelift functions have no SEH handlers
prologue_size: prologue_size as u8,
frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
frame_register_offset: reported_frame_offset,
frame_register_offset,
unwind_codes,
}))
}
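As a worked example of the rewriting above (counts chosen for illustration, not taken from this patch): with three GPR pushes and two XMM saves, the two `SaveXmm` codes get rewritten stack offsets of 16 and 0 ((2 - 1) · 16 and (2 - 2) · 16), and `frame_register_offset` becomes 2 + (3 + 1) / 2 = 4, i.e. four 16-byte slots between the frame register and the FPR save area, with the odd number of 8-byte pushes rounded up to a whole 16-byte slot (8 bytes of alignment padding).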
@ -284,7 +241,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 9,
size: 64 + 32
size: 64
}
]
}
@ -303,7 +260,7 @@ mod tests {
0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
0x05, // Frame register + offset (RBP with 0 offset)
0x09, // Prolog offset
0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
0x72, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0x7 * 8) + 8 = 64 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset
@ -349,7 +306,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 27,
size: 10000 + 32
size: 10000
}
]
}
@ -369,8 +326,8 @@ mod tests {
0x05, // Frame register + offset (RBP with 0 offset)
0x1B, // Prolog offset
0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
0xE6, // Low size byte
0x04, // High size byte (e.g. 0x04E6 * 8 = 100032 (10000 + 32) bytes)
0xE2, // Low size byte
0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset
@ -414,7 +371,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 27,
size: 1000000 + 32
size: 1000000
}
]
}
@ -434,10 +391,10 @@ mod tests {
0x05, // Frame register + offset (RBP with 0 offset)
0x1B, // Prolog offset
0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
0x60, // Byte 1 of size
0x40, // Byte 1 of size
0x42, // Byte 2 of size
0x0F, // Byte 3 of size
0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset


@ -504,6 +504,13 @@ where
// this value.
pos.ins().with_results([into_result]).ireduce(ty, arg)
}
// ABI argument is a pointer to the value we want.
ValueConversion::Pointer(abi_ty) => {
let arg = convert_from_abi(pos, abi_ty, None, get_arg);
pos.ins()
.with_results([into_result])
.load(ty, MemFlags::new(), arg, 0)
}
}
}
@ -563,6 +570,18 @@ fn convert_to_abi<PutArg>(
let arg = pos.ins().uextend(abi_ty, value);
convert_to_abi(pos, cfg, arg, put_arg);
}
ValueConversion::Pointer(abi_ty) => {
// Note: This conversion can only happen for call arguments,
// so we can allocate the value on stack safely.
let stack_slot = pos.func.create_stack_slot(StackSlotData {
kind: StackSlotKind::ExplicitSlot,
size: ty.bytes(),
offset: None,
});
let arg = pos.ins().stack_addr(abi_ty, stack_slot, 0);
pos.ins().store(MemFlags::new(), value, arg, 0);
convert_to_abi(pos, cfg, arg, put_arg);
}
}
}
@ -757,12 +776,6 @@ pub fn handle_call_abi(
{
legalize_sret_call(isa, pos, sig_ref, inst);
} else {
// OK, we need to fix the call arguments to match the ABI signature.
let abi_args = pos.func.dfg.signatures[sig_ref].params.len();
legalize_inst_arguments(pos, cfg, abi_args, |func, abi_arg| {
func.dfg.signatures[sig_ref].params[abi_arg]
});
if !pos.func.dfg.signatures[sig_ref].returns.is_empty() {
inst = legalize_inst_results(pos, |func, abi_res| {
func.dfg.signatures[sig_ref].returns[abi_res]
@ -770,6 +783,13 @@ pub fn handle_call_abi(
}
}
// Go back and fix the call arguments to match the ABI signature.
pos.goto_inst(inst);
let abi_args = pos.func.dfg.signatures[sig_ref].params.len();
legalize_inst_arguments(pos, cfg, abi_args, |func, abi_arg| {
func.dfg.signatures[sig_ref].params[abi_arg]
});
debug_assert!(
check_call_signature(&pos.func.dfg, inst).is_ok(),
"Signature still wrong: {}, {}{}",
@ -814,7 +834,12 @@ pub fn handle_return_abi(inst: Inst, func: &mut Function, cfg: &ControlFlowGraph
pos.use_srcloc(inst);
legalize_inst_arguments(pos, cfg, abi_args, |func, abi_arg| {
func.signature.returns[abi_arg]
let arg = func.signature.returns[abi_arg];
debug_assert!(
!arg.legalized_to_pointer,
"Return value cannot be legalized to pointer"
);
arg
});
// Append special return arguments for any `sret`, `link`, and `vmctx` return values added to
// the legalized signature. These values should simply be propagated from the entry block


@ -35,7 +35,7 @@ mod table;
use self::call::expand_call;
use self::globalvalue::expand_global_value;
use self::heap::expand_heap_addr;
use self::libcall::expand_as_libcall;
pub(crate) use self::libcall::expand_as_libcall;
use self::table::expand_table_addr;
enum LegalizeInstResult {


@ -99,12 +99,12 @@ mod iterators;
mod legalizer;
mod licm;
mod nan_canonicalization;
mod num_uses;
mod partition_slice;
mod postopt;
mod predicates;
mod redundant_reload_remover;
mod regalloc;
mod remove_constant_phis;
mod result;
mod scoped_hash_map;
mod simple_gvn;
@ -114,6 +114,9 @@ mod topo_order;
mod unreachable_code;
mod value_label;
#[cfg(feature = "enable-peepmatic")]
mod peepmatic;
pub use crate::result::{CodegenError, CodegenResult};
/// Version number of this crate.


@ -12,6 +12,15 @@ pub trait ABIBody {
/// The instruction type for the ISA associated with this ABI.
type I: VCodeInst;
/// Does the ABI-body code need a temp reg? One will be provided to `init()`
/// as the `maybe_tmp` arg if so.
fn temp_needed(&self) -> bool;
/// Initialize. This is called after the ABIBody is constructed because it
/// may be provided with a temp vreg, which can only be allocated once the
/// lowering context exists.
fn init(&mut self, maybe_tmp: Option<Writable<Reg>>);
/// Get the settings controlling this function's compilation.
fn flags(&self) -> &settings::Flags;
@ -34,6 +43,13 @@ pub trait ABIBody {
/// register.
fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I;
/// Generate any setup instruction needed to save values to the
/// return-value area. This is usually used when there are multiple return
/// values or an otherwise large return value that must be passed on the
/// stack; typically the ABI specifies an extra hidden argument that is a
/// pointer to that memory.
fn gen_retval_area_setup(&self) -> Option<Self::I>;
/// Generate an instruction which copies a source register to a return value slot.
fn gen_copy_reg_to_retval(
&self,
@ -98,7 +114,10 @@ pub trait ABIBody {
fn gen_epilogue(&self) -> Vec<Self::I>;
/// Returns the full frame size for the given function, after prologue emission has run. This
/// comprises the spill space, incoming argument space, alignment padding, etc.
/// comprises the spill slots and stack-storage slots (but not storage for clobbered callee-save
/// registers, arguments pushed at callsites within this function, or other ephemeral pushes).
/// This is used for ABI variants where the client generates prologue/epilogue code, as in
/// Baldrdash (SpiderMonkey integration).
fn frame_size(&self) -> u32;
/// Get the spill-slot size.
@ -132,24 +151,29 @@ pub trait ABICall {
/// Get the number of arguments expected.
fn num_args(&self) -> usize;
/// Copy an argument value from a source register, prior to the call.
fn gen_copy_reg_to_arg<C: LowerCtx<I = Self::I>>(
/// Emit a copy of an argument value from a source register, prior to the call.
fn emit_copy_reg_to_arg<C: LowerCtx<I = Self::I>>(
&self,
ctx: &mut C,
idx: usize,
from_reg: Reg,
) -> Vec<Self::I>;
);
/// Copy a return value into a destination register, after the call returns.
fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I;
/// Emit a copy a return value into a destination register, after the call returns.
fn emit_copy_retval_to_reg<C: LowerCtx<I = Self::I>>(
&self,
ctx: &mut C,
idx: usize,
into_reg: Writable<Reg>,
);
/// Pre-adjust the stack, prior to argument copies and call.
fn gen_stack_pre_adjust(&self) -> Vec<Self::I>;
/// Emit code to pre-adjust the stack, prior to argument copies and call.
fn emit_stack_pre_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C);
/// Post-adjust the stack, after call return and return-value copies.
fn gen_stack_post_adjust(&self) -> Vec<Self::I>;
/// Emit code to post-adjust the stack, after call return and return-value copies.
fn emit_stack_post_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C);
/// Generate the call itself.
/// Emit the call itself.
///
/// The returned instruction should have proper use- and def-sets according
/// to the argument registers, return-value registers, and clobbered
@ -159,5 +183,8 @@ pub trait ABICall {
/// registers are also logically defs, but should never be read; their
/// values are "defined" (to the regalloc) but "undefined" in every other
/// sense.)
fn gen_call(&self) -> Vec<Self::I>;
///
/// This function should only be called once, as it is allowed to re-use
/// parts of the ABICall object in emitting instructions.
fn emit_call<C: LowerCtx<I = Self::I>>(&mut self, ctx: &mut C);
}
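To make the intended protocol concrete, here is a minimal sketch of the order in which a lowering backend is expected to drive these hooks; the helper function and its argument/return slices are assumptions for illustration, not an API this patch adds.

```rust
// Illustrative sketch only -- not part of this patch. Assumes `ABICall` and
// `LowerCtx` from crate::machinst and `regalloc::{Reg, Writable}` are in scope.
fn emit_call_sequence<A, C>(abi: &mut A, ctx: &mut C, args: &[Reg], rets: &[Writable<Reg>])
where
    A: ABICall,
    C: LowerCtx<I = A::I>,
{
    assert_eq!(args.len(), abi.num_args());
    // Reserve any outgoing-argument stack space first.
    abi.emit_stack_pre_adjust(ctx);
    // Move each argument value into its ABI-assigned register or stack slot.
    for (i, &arg) in args.iter().enumerate() {
        abi.emit_copy_reg_to_arg(ctx, i, arg);
    }
    // The call itself; its use/def/clobber sets come from the callee signature.
    abi.emit_call(ctx);
    // Copy return values out of their ABI locations.
    for (i, &ret) in rets.iter().enumerate() {
        abi.emit_copy_retval_to_reg(ctx, i, ret);
    }
    // Release the outgoing-argument space.
    abi.emit_stack_post_adjust(ctx);
}
```

Emitting through `LowerCtx` instead of returning `Vec<Self::I>` lets the ABI object allocate temporaries and produce multi-instruction sequences directly, which is also why `emit_call` takes `&mut self` and should be called only once.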


@ -10,6 +10,7 @@ use crate::settings::Flags;
#[cfg(feature = "testing_hooks")]
use crate::regalloc::RegDiversions;
use core::any::Any;
use std::borrow::Cow;
use std::fmt;
use target_lexicon::Triple;
@ -127,4 +128,8 @@ impl TargetIsa for TargetIsaAdapter {
fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC {
self.backend.unsigned_sub_overflow_condition()
}
fn as_any(&self) -> &dyn Any {
self as &dyn Any
}
}


@ -1,59 +1,624 @@
//! Computation of basic block order in emitted code.
//!
//! This module handles the translation from CLIF BBs to VCode BBs.
//!
//! The basic idea is that we compute a sequence of "lowered blocks" that
//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit
//! block on *every* edge). Conceptually, the lowering pipeline wants to insert
//! moves for phi-nodes on every block-to-block transfer; these blocks always
//! conceptually exist, but may be merged with an "original" CLIF block (and
//! hence not actually exist; this is equivalent to inserting the blocks only on
//! critical edges).
//!
//! In other words, starting from a CFG like this (where each "CLIF block" and
//! "(edge N->M)" is a separate basic block):
//!
//! ```plain
//!
//! CLIF block 0
//! / \
//! (edge 0->1) (edge 0->2)
//! | |
//! CLIF block 1 CLIF block 2
//! \ /
//! (edge 1->3) (edge 2->3)
//! \ /
//! CLIF block 3
//! ```
//!
//! We can produce a CFG of lowered blocks like so:
//!
//! ```plain
//! +--------------+
//! | CLIF block 0 |
//! +--------------+
//! / \
//! +--------------+ +--------------+
//! | (edge 0->1) | |(edge 0->2) |
//! | CLIF block 1 | | CLIF block 2 |
//! +--------------+ +--------------+
//! \ /
//! +-----------+ +-----------+
//! |(edge 1->3)| |(edge 2->3)|
//! +-----------+ +-----------+
//! \ /
//! +------------+
//! |CLIF block 3|
//! +------------+
//! ```
//!
//! (note that the edges into CLIF blocks 1 and 2 could be merged with those
//! blocks' original bodies, but the out-edges could not because for simplicity
//! in the successor-function definition, we only ever merge an edge onto one
//! side of an original CLIF block.)
//!
//! Each `LoweredBlock` names just an original CLIF block, an original CLIF
//! block prepended or appended with an edge block (never both, though), or just
//! an edge block.
//!
//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph
//! (never actually materialized, just defined by a "successors" function), and
//! compute the reverse postorder.
//!
//! This algorithm isn't perfect w.r.t. generated code quality: we don't, for
//! example, consider any information about whether edge blocks will actually
//! have content, because this computation happens as part of lowering *before*
//! regalloc, and regalloc may or may not insert moves/spills/reloads on any
//! particular edge. But it works relatively well and is conceptually simple.
//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like
//! branch editing that in practice elides empty blocks and simplifies some of
//! the other redundancies that this scheme produces.
use crate::entity::SecondaryMap;
use crate::fx::{FxHashMap, FxHashSet};
use crate::ir::{Block, Function, Inst, Opcode};
use crate::machinst::lower::visit_block_succs;
use crate::machinst::*;
use regalloc::{BlockIx, Function};
/// Simple reverse postorder-based block order emission.
///
/// TODO: use a proper algorithm, such as the bottom-up straight-line-section
/// construction algorithm.
struct BlockRPO {
visited: Vec<bool>,
postorder: Vec<BlockIndex>,
deferred_last: Option<BlockIndex>,
use log::debug;
use smallvec::SmallVec;
/// Mapping from CLIF BBs to VCode BBs.
#[derive(Debug)]
pub struct BlockLoweringOrder {
/// Lowered blocks, in BlockIndex order. Each block is some combination of
/// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after;
/// see [LoweredBlock] for details.
lowered_order: Vec<LoweredBlock>,
/// Successors for all lowered blocks, in one serialized vector. Indexed by
/// the ranges in `lowered_succ_ranges`.
lowered_succs: Vec<(Inst, LoweredBlock)>,
/// BlockIndex values for successors for all lowered blocks, in the same
/// order as `lowered_succs`.
lowered_succ_indices: Vec<(Inst, BlockIndex)>,
/// Ranges in `lowered_succs` giving the successor lists for each lowered
/// block. Indexed by lowering-order index (`BlockIndex`).
lowered_succ_ranges: Vec<(usize, usize)>,
/// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that
/// some CLIF BBs may not be lowered; in particular, we skip unreachable
/// blocks.
orig_map: SecondaryMap<Block, Option<BlockIndex>>,
}
impl BlockRPO {
fn new<I: VCodeInst>(vcode: &VCode<I>) -> BlockRPO {
BlockRPO {
visited: vec![false; vcode.num_blocks()],
postorder: vec![],
deferred_last: None,
/// The origin of a block in the lowered block-order: either an original CLIF
/// block, or an inserted edge-block, or a combination of the two if an edge is
/// non-critical.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum LoweredBlock {
/// Block in original CLIF, with no merged edge-blocks.
Orig {
/// Original CLIF block.
block: Block,
},
/// Block in the original CLIF, plus edge-block to one succ (which is the
/// one successor of the original block).
OrigAndEdge {
/// The original CLIF block contained in this lowered block.
block: Block,
/// The edge (jump) instruction transitioning from this block
/// to the next, i.e., corresponding to the included edge-block. This
/// will be an instruction in `block`.
edge_inst: Inst,
/// The successor CLIF block.
succ: Block,
},
/// Block in the original CLIF, preceded by edge-block from one pred (which
/// is the one pred of the original block).
EdgeAndOrig {
/// The previous CLIF block, i.e., the edge block's predecessor.
pred: Block,
/// The edge (jump) instruction corresponding to the included
/// edge-block. This will be an instruction in `pred`.
edge_inst: Inst,
/// The original CLIF block included in this lowered block.
block: Block,
},
/// Split critical edge between two CLIF blocks. This lowered block does not
/// correspond to any original CLIF blocks; it only serves as an insertion
/// point for work to happen on the transition from `pred` to `succ`.
Edge {
/// The predecessor CLIF block.
pred: Block,
/// The edge (jump) instruction corresponding to this edge's transition.
/// This will be an instruction in `pred`.
edge_inst: Inst,
/// The successor CLIF block.
succ: Block,
},
}
impl LoweredBlock {
/// The associated original (CLIF) block included in this lowered block, if
/// any.
pub fn orig_block(self) -> Option<Block> {
match self {
LoweredBlock::Orig { block, .. }
| LoweredBlock::OrigAndEdge { block, .. }
| LoweredBlock::EdgeAndOrig { block, .. } => Some(block),
LoweredBlock::Edge { .. } => None,
}
}
fn visit<I: VCodeInst>(&mut self, vcode: &VCode<I>, block: BlockIndex) {
self.visited[block as usize] = true;
for succ in vcode.succs(block) {
if !self.visited[*succ as usize] {
self.visit(vcode, *succ);
/// The associated in-edge, if any.
pub fn in_edge(self) -> Option<(Block, Inst, Block)> {
match self {
LoweredBlock::EdgeAndOrig {
pred,
edge_inst,
block,
} => Some((pred, edge_inst, block)),
_ => None,
}
}
/// The associated out-edge, if any. Also includes edge-only blocks.
pub fn out_edge(self) -> Option<(Block, Inst, Block)> {
match self {
LoweredBlock::OrigAndEdge {
block,
edge_inst,
succ,
} => Some((block, edge_inst, succ)),
LoweredBlock::Edge {
pred,
edge_inst,
succ,
} => Some((pred, edge_inst, succ)),
_ => None,
}
}
}
impl BlockLoweringOrder {
/// Compute and return a lowered block order for `f`.
pub fn new(f: &Function) -> BlockLoweringOrder {
debug!("BlockLoweringOrder: function body {:?}", f);
// Step 1: compute the in-edge and out-edge count of every block.
let mut block_in_count = SecondaryMap::with_default(0);
let mut block_out_count = SecondaryMap::with_default(0);
// Cache the block successors to avoid re-examining branches below.
let mut block_succs: SmallVec<[(Inst, Block); 128]> = SmallVec::new();
let mut block_succ_range = SecondaryMap::with_default((0, 0));
let mut fallthrough_return_block = None;
for block in f.layout.blocks() {
let block_succ_start = block_succs.len();
visit_block_succs(f, block, |inst, succ| {
block_out_count[block] += 1;
block_in_count[succ] += 1;
block_succs.push((inst, succ));
});
let block_succ_end = block_succs.len();
block_succ_range[block] = (block_succ_start, block_succ_end);
for inst in f.layout.block_likely_branches(block) {
if f.dfg[inst].opcode() == Opcode::Return {
// Implicit output edge for any return.
block_out_count[block] += 1;
}
if f.dfg[inst].opcode() == Opcode::FallthroughReturn {
// Fallthrough return block must come last.
debug_assert!(fallthrough_return_block == None);
fallthrough_return_block = Some(block);
}
}
}
// Implicit input edge for entry block.
if let Some(entry) = f.layout.entry_block() {
block_in_count[entry] += 1;
}
// Here we define the implicit CLIF-plus-edges graph. There are
// conceptually two such graphs: the original, with every edge explicit,
// and the merged one, with blocks (represented by `LoweredBlock`
// values) that contain original CLIF blocks, edges, or both. This
// function returns a lowered block's successors as per the latter, with
// consideration to edge-block merging.
//
// Note that there is a property of the block-merging rules below
// that is very important to ensure we don't miss any lowered blocks:
// any block in the implicit CLIF-plus-edges graph will *only* be
// included in one block in the merged graph.
//
// This, combined with the property that every edge block is reachable
// only from one predecessor (and hence cannot be reached by a DFS
// backedge), means that it is sufficient in our DFS below to track
// visited-bits per original CLIF block only, not per edge. This greatly
// simplifies the data structures (no need to keep a sparse hash-set of
// (block, block) tuples).
let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| {
let start_idx = ret.len();
match block {
LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => {
// At an orig block; successors are always edge blocks,
// possibly with orig blocks following.
let range = block_succ_range[block];
for &(edge_inst, succ) in &block_succs[range.0..range.1] {
if block_in_count[succ] == 1 {
ret.push((
edge_inst,
LoweredBlock::EdgeAndOrig {
pred: block,
edge_inst,
block: succ,
},
));
} else {
ret.push((
edge_inst,
LoweredBlock::Edge {
pred: block,
edge_inst,
succ,
},
));
}
}
}
LoweredBlock::Edge {
succ, edge_inst, ..
}
| LoweredBlock::OrigAndEdge {
succ, edge_inst, ..
} => {
// At an edge block; successors are always orig blocks,
// possibly with edge blocks following.
if block_out_count[succ] == 1 {
let range = block_succ_range[succ];
// check if the one succ is a real CFG edge (vs.
// implicit return succ).
if range.1 - range.0 > 0 {
debug_assert!(range.1 - range.0 == 1);
let (succ_edge_inst, succ_succ) = block_succs[range.0];
ret.push((
edge_inst,
LoweredBlock::OrigAndEdge {
block: succ,
edge_inst: succ_edge_inst,
succ: succ_succ,
},
));
} else {
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
}
} else {
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
}
}
}
let end_idx = ret.len();
(start_idx, end_idx)
};
// Build the explicit LoweredBlock-to-LoweredBlock successors list.
let mut lowered_succs = vec![];
let mut lowered_succ_indices = vec![];
// Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an
// explicit stack so we don't overflow the real stack with a deep DFS.
#[derive(Debug)]
struct StackEntry {
this: LoweredBlock,
succs: (usize, usize), // range in lowered_succs
cur_succ: usize, // index in lowered_succs
}
let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new();
let mut visited = FxHashSet::default();
let mut postorder = vec![];
if let Some(entry) = f.layout.entry_block() {
// FIXME(cfallin): we might be able to use OrigAndEdge. Find a way
// to not special-case the entry block here.
let block = LoweredBlock::Orig { block: entry };
visited.insert(block);
let range = compute_lowered_succs(&mut lowered_succs, block);
lowered_succ_indices.resize(lowered_succs.len(), 0);
stack.push(StackEntry {
this: block,
succs: range,
cur_succ: range.1,
});
}
let mut deferred_last = None;
while !stack.is_empty() {
let stack_entry = stack.last_mut().unwrap();
let range = stack_entry.succs;
if stack_entry.cur_succ == range.0 {
let orig_block = stack_entry.this.orig_block();
if orig_block.is_some() && orig_block == fallthrough_return_block {
deferred_last = Some((stack_entry.this, range));
} else {
postorder.push((stack_entry.this, range));
}
stack.pop();
} else {
// Heuristic: chase the children in reverse. This puts the first
// successor block first in RPO, all other things being equal,
// which tends to prioritize loop backedges over out-edges,
// putting the edge-block closer to the loop body and minimizing
// live-ranges in linear instruction space.
let next = lowered_succs[stack_entry.cur_succ - 1].1;
stack_entry.cur_succ -= 1;
if visited.contains(&next) {
continue;
}
visited.insert(next);
let range = compute_lowered_succs(&mut lowered_succs, next);
lowered_succ_indices.resize(lowered_succs.len(), 0);
stack.push(StackEntry {
this: next,
succs: range,
cur_succ: range.1,
});
}
}
for i in vcode.block_insns(BlockIx::new(block)) {
if vcode.get_insn(i).is_epilogue_placeholder() {
debug_assert!(self.deferred_last.is_none());
self.deferred_last = Some(block);
return;
postorder.reverse();
let mut rpo = postorder;
if let Some(d) = deferred_last {
rpo.push(d);
}
// Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
let mut lowered_order = vec![];
let mut lowered_succ_ranges = vec![];
let mut lb_to_bindex = FxHashMap::default();
for (block, succ_range) in rpo.into_iter() {
lb_to_bindex.insert(block, lowered_order.len() as BlockIndex);
lowered_order.push(block);
lowered_succ_ranges.push(succ_range);
}
let lowered_succ_indices = lowered_succs
.iter()
.map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap()))
.collect();
let mut orig_map = SecondaryMap::with_default(None);
for (i, lb) in lowered_order.iter().enumerate() {
let i = i as BlockIndex;
if let Some(b) = lb.orig_block() {
orig_map[b] = Some(i);
}
}
self.postorder.push(block);
let result = BlockLoweringOrder {
lowered_order,
lowered_succs,
lowered_succ_indices,
lowered_succ_ranges,
orig_map,
};
debug!("BlockLoweringOrder: {:?}", result);
result
}
/// Get the lowered order of blocks.
pub fn lowered_order(&self) -> &[LoweredBlock] {
&self.lowered_order[..]
}
/// Get the successors for a lowered block, by index in `lowered_order()`'s
/// returned slice. Each successor is paired with the edge-instruction
/// (branch) corresponding to this edge.
pub fn succs(&self, block: BlockIndex) -> &[(Inst, LoweredBlock)] {
let range = self.lowered_succ_ranges[block as usize];
&self.lowered_succs[range.0..range.1]
}
/// Get the successor indices for a lowered block.
pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] {
let range = self.lowered_succ_ranges[block as usize];
&self.lowered_succ_indices[range.0..range.1]
}
/// Get the lowered block index containing a CLIF block, if any. (May not be
/// present if the original CLIF block was unreachable.)
pub fn lowered_block_for_bb(&self, bb: Block) -> Option<BlockIndex> {
self.orig_map[bb]
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::cursor::{Cursor, FuncCursor};
use crate::ir::types::*;
use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
use crate::isa::CallConv;
fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function {
assert!(n_blocks > 0);
let name = ExternalName::testcase("test0");
let mut sig = Signature::new(CallConv::SystemV);
sig.params.push(AbiParam::new(I32));
let mut func = Function::with_name_signature(name, sig);
let blocks = (0..n_blocks)
.map(|i| {
let bb = func.dfg.make_block();
assert!(bb.as_u32() == i as u32);
bb
})
.collect::<Vec<_>>();
let arg0 = func.dfg.append_block_param(blocks[0], I32);
let mut pos = FuncCursor::new(&mut func);
let mut edge = 0;
for i in 0..n_blocks {
pos.insert_block(blocks[i]);
let mut succs = vec![];
while edge < edges.len() && edges[edge].0 == i {
succs.push(edges[edge].1);
edge += 1;
}
if succs.len() == 0 {
pos.ins().return_(&[arg0]);
} else if succs.len() == 1 {
pos.ins().jump(blocks[succs[0]], &[]);
} else if succs.len() == 2 {
pos.ins().brnz(arg0, blocks[succs[0]], &[]);
pos.ins().jump(blocks[succs[1]], &[]);
} else {
panic!("Too many successors");
}
}
func
}
#[test]
fn test_blockorder_diamond() {
let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]);
let order = BlockLoweringOrder::new(&func);
assert_eq!(order.lowered_order.len(), 6);
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
assert!(order.lowered_order[0].in_edge().is_none());
assert!(order.lowered_order[0].out_edge().is_none());
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
assert!(order.lowered_order[2].orig_block().is_none());
assert!(order.lowered_order[2].in_edge().is_none());
assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1);
assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3);
assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2);
assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2);
assert!(order.lowered_order[3].out_edge().is_none());
assert!(order.lowered_order[4].orig_block().is_none());
assert!(order.lowered_order[4].in_edge().is_none());
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2);
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3);
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3);
assert!(order.lowered_order[5].in_edge().is_none());
assert!(order.lowered_order[5].out_edge().is_none());
}
#[test]
fn test_blockorder_critedge() {
// 0
// / \
// 1 2
// / \ \
// 3 4 |
// |\ _|____|
// | \/ |
// | /\ |
// 5 6
//
// (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split)
//
let func = build_test_func(
7,
&[
(0, 1),
(0, 2),
(1, 3),
(1, 4),
(2, 5),
(3, 5),
(3, 6),
(4, 6),
],
);
let order = BlockLoweringOrder::new(&func);
assert_eq!(order.lowered_order.len(), 11);
println!("ordered = {:?}", order.lowered_order);
// block 0
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
assert!(order.lowered_order[0].in_edge().is_none());
assert!(order.lowered_order[0].out_edge().is_none());
// edge 0->1 + block 1
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
assert!(order.lowered_order[1].out_edge().is_none());
// edge 1->3 + block 3
assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3);
assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1);
assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3);
assert!(order.lowered_order[2].out_edge().is_none());
// edge 3->5
assert!(order.lowered_order[3].orig_block().is_none());
assert!(order.lowered_order[3].in_edge().is_none());
assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3);
assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5);
// edge 3->6
assert!(order.lowered_order[4].orig_block().is_none());
assert!(order.lowered_order[4].in_edge().is_none());
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3);
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6);
// edge 1->4 + block 4
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4);
assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1);
assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4);
assert!(order.lowered_order[5].out_edge().is_none());
// edge 4->6
assert!(order.lowered_order[6].orig_block().is_none());
assert!(order.lowered_order[6].in_edge().is_none());
assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4);
assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6);
// block 6
assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6);
assert!(order.lowered_order[7].in_edge().is_none());
assert!(order.lowered_order[7].out_edge().is_none());
// edge 0->2 + block 2
assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2);
assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2);
assert!(order.lowered_order[8].out_edge().is_none());
// edge 2->5
assert!(order.lowered_order[9].orig_block().is_none());
assert!(order.lowered_order[9].in_edge().is_none());
assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2);
assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5);
// block 5
assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5);
assert!(order.lowered_order[10].in_edge().is_none());
assert!(order.lowered_order[10].out_edge().is_none());
}
}
/// Compute the final block order.
pub fn compute_final_block_order<I: VCodeInst>(vcode: &VCode<I>) -> Vec<BlockIndex> {
let mut rpo = BlockRPO::new(vcode);
rpo.visit(vcode, vcode.entry());
rpo.rpo()
}
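
As a side note on Step 2 above: the explicit-stack postorder walk can be sketched in isolation on a plain adjacency-list graph. The following is a minimal, hypothetical illustration (toy `usize` node ids rather than the CLIF/edge-block types, and without the reverse-successor heuristic or the deferred fallthrough-return handling):

// Reverse postorder over `succs` (successor lists), starting at `entry`,
// using an explicit work stack so a deep CFG cannot overflow the call stack.
fn reverse_postorder(succs: &[Vec<usize>], entry: usize) -> Vec<usize> {
    // Each stack entry is (node, index of the next successor to visit).
    let mut stack: Vec<(usize, usize)> = vec![(entry, 0)];
    let mut visited = vec![false; succs.len()];
    visited[entry] = true;
    let mut postorder = Vec::new();
    while !stack.is_empty() {
        let top = stack.len() - 1;
        let (node, next_succ) = stack[top];
        if next_succ == succs[node].len() {
            // All successors finished: emit in postorder and pop.
            postorder.push(node);
            stack.pop();
        } else {
            // Advance this entry's cursor, then descend if unvisited.
            stack[top].1 += 1;
            let succ = succs[node][next_succ];
            if !visited[succ] {
                visited[succ] = true;
                stack.push((succ, 0));
            }
        }
    }
    postorder.reverse(); // reversed postorder is RPO
    postorder
}

fn main() {
    // Diamond CFG: 0 -> {1, 2}, 1 -> 3, 2 -> 3.
    let succs = vec![vec![1, 2], vec![3], vec![3], vec![]];
    let rpo = reverse_postorder(&succs, 0);
    assert_eq!(rpo[0], 0); // the entry block always comes first
    assert_eq!(*rpo.last().unwrap(), 3); // the join block comes last
    println!("{:?}", rpo);
}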

1522
third_party/rust/cranelift-codegen/src/machinst/buffer.rs vendored Normal file

The diff between these files is not shown because of its large size. Load diff

View file

@ -6,11 +6,11 @@ use crate::settings;
use crate::timing;
use log::debug;
use regalloc::{allocate_registers, RegAllocAlgorithm};
use regalloc::{allocate_registers_with_opts, Algorithm, Options};
/// Compile the given function down to VCode with allocated registers, ready
/// for binary emission.
pub fn compile<B: LowerBackend>(
pub fn compile<B: LowerBackend + MachBackend>(
f: &Function,
b: &B,
abi: Box<dyn ABIBody<I = B::MInst>>,
@ -18,29 +18,46 @@ pub fn compile<B: LowerBackend>(
where
B::MInst: ShowWithRRU,
{
// This lowers the CL IR.
let mut vcode = Lower::new(f, abi)?.lower(b)?;
// Compute lowered block order.
let block_order = BlockLoweringOrder::new(f);
// Build the lowering context.
let lower = Lower::new(f, abi, block_order)?;
// Lower the IR.
let mut vcode = lower.lower(b)?;
let universe = &B::MInst::reg_universe(vcode.flags());
debug!("vcode from lowering: \n{}", vcode.show_rru(Some(universe)));
debug!(
"vcode from lowering: \n{}",
vcode.show_rru(Some(b.reg_universe()))
);
// Perform register allocation.
let algorithm = match vcode.flags().regalloc() {
settings::Regalloc::Backtracking => RegAllocAlgorithm::Backtracking,
settings::Regalloc::BacktrackingChecked => RegAllocAlgorithm::BacktrackingChecked,
settings::Regalloc::ExperimentalLinearScan => RegAllocAlgorithm::LinearScan,
let (run_checker, algorithm) = match vcode.flags().regalloc() {
settings::Regalloc::Backtracking => (false, Algorithm::Backtracking(Default::default())),
settings::Regalloc::BacktrackingChecked => {
(true, Algorithm::Backtracking(Default::default()))
}
settings::Regalloc::ExperimentalLinearScan => {
(false, Algorithm::LinearScan(Default::default()))
}
settings::Regalloc::ExperimentalLinearScanChecked => {
(true, Algorithm::LinearScan(Default::default()))
}
};
let result = {
let _tt = timing::regalloc();
allocate_registers(
&mut vcode, algorithm, universe, /*request_block_annotations=*/ false,
allocate_registers_with_opts(
&mut vcode,
b.reg_universe(),
Options {
run_checker,
algorithm,
},
)
.map_err(|err| {
debug!(
"Register allocation error for vcode\n{}\nError: {:?}",
vcode.show_rru(Some(universe)),
vcode.show_rru(Some(b.reg_universe())),
err
);
err
@ -52,14 +69,9 @@ where
// all at once. This also inserts prologues/epilogues.
vcode.replace_insns_from_regalloc(result);
vcode.remove_redundant_branches();
// Do final passes over code to finalize branches.
vcode.finalize_branches();
debug!(
"vcode after regalloc: final version:\n{}",
vcode.show_rru(Some(universe))
vcode.show_rru(Some(b.reg_universe()))
);
Ok(vcode)

The diff between these files is not shown because of its large size. Load diff

View file

@ -109,6 +109,7 @@ use regalloc::RegUsageCollector;
use regalloc::{
RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable,
};
use smallvec::SmallVec;
use std::string::String;
use target_lexicon::Triple;
@ -124,8 +125,8 @@ pub mod abi;
pub use abi::*;
pub mod pretty_print;
pub use pretty_print::*;
pub mod sections;
pub use sections::*;
pub mod buffer;
pub use buffer::*;
pub mod adapter;
pub use adapter::*;
@ -137,7 +138,7 @@ pub trait MachInst: Clone + Debug {
/// Map virtual registers to physical registers using the given virt->phys
/// maps corresponding to the program points prior to, and after, this instruction.
fn map_regs(&mut self, maps: &RegUsageMapper);
fn map_regs<RUM: RegUsageMapper>(&mut self, maps: &RUM);
/// If this is a simple move, return the (source, destination) tuple of registers.
fn is_move(&self) -> Option<(Writable<Reg>, Reg)>;
@ -152,6 +153,9 @@ pub trait MachInst: Clone + Debug {
/// Generate a move.
fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self;
/// Generate a constant into a reg.
fn gen_constant(to_reg: Writable<Reg>, value: u64, ty: Type) -> SmallVec<[Self; 4]>;
/// Generate a zero-length no-op.
fn gen_zero_len_nop() -> Self;
@ -166,7 +170,7 @@ pub trait MachInst: Clone + Debug {
/// Generate a jump to another target. Used during lowering of
/// control flow.
fn gen_jump(target: BlockIndex) -> Self;
fn gen_jump(target: MachLabel) -> Self;
/// Generate a NOP. The `preferred_size` parameter allows the caller to
/// request a NOP of that size, or as close to it as possible. The machine
@ -175,17 +179,6 @@ pub trait MachInst: Clone + Debug {
/// the instruction must have a nonzero size.
fn gen_nop(preferred_size: usize) -> Self;
/// Rewrite block targets using the block-target map.
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]);
/// Finalize branches once the block order (fallthrough) is known.
fn with_fallthrough_block(&mut self, fallthrough_block: Option<BlockIndex>);
/// Update instruction once block offsets are known. These offsets are
/// relative to the beginning of the function. `targets` is indexed by
/// BlockIndex.
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]);
/// Get the register universe for this backend.
fn reg_universe(flags: &Flags) -> RealRegUniverse;
@ -194,6 +187,54 @@ pub trait MachInst: Clone + Debug {
fn align_basic_block(offset: CodeOffset) -> CodeOffset {
offset
}
/// What is the worst-case instruction size emitted by this instruction type?
fn worst_case_size() -> CodeOffset;
/// A label-use kind: a type that describes the types of label references that
/// can occur in an instruction.
type LabelUse: MachInstLabelUse;
}
/// A descriptor of a label reference (use) in an instruction set.
pub trait MachInstLabelUse: Clone + Copy + Debug + Eq {
/// Required alignment for any veneer. Usually the required instruction
/// alignment (e.g., 4 for a RISC with 32-bit instructions, or 1 for x86).
const ALIGN: CodeOffset;
/// What is the maximum PC-relative range (positive)? E.g., if `1024`, a
/// label-reference fixup at offset `x` is valid if the label resolves to `x
/// + 1024`.
fn max_pos_range(self) -> CodeOffset;
/// What is the maximum PC-relative range (negative)? This is the absolute
/// value; i.e., if `1024`, then a label-reference fixup at offset `x` is
/// valid if the label resolves to `x - 1024`.
fn max_neg_range(self) -> CodeOffset;
/// What is the size of code-buffer slice this label-use needs to patch in
/// the label's value?
fn patch_size(self) -> CodeOffset;
/// Perform a code-patch, given the offset into the buffer of this label use
/// and the offset into the buffer of the label's definition.
/// It is guaranteed that the label will resolve to an offset within the
/// range `[use_offset - self.max_neg_range(), use_offset +
/// self.max_pos_range()]`.
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset);
/// Can the label-use be patched to a veneer that supports a longer range?
/// Usually valid for jumps (a short-range jump can jump to a longer-range
/// jump), but not for e.g. constant pool references, because the constant
/// load would require different code (one more level of indirection).
fn supports_veneer(self) -> bool;
/// How many bytes are needed for a veneer?
fn veneer_size(self) -> CodeOffset;
/// Generate a veneer. The given code-buffer slice is `self.veneer_size()`
/// bytes long at offset `veneer_offset` in the buffer. The original
/// label-use will be patched to refer to this veneer's offset. A new
/// (offset, LabelUse) is returned that allows the veneer to use the actual
/// label. For veneers to work properly, it is expected that the new veneer
/// has a larger range; on most platforms this probably means either a
/// "long-range jump" (e.g., on ARM, the 26-bit form), or if already at that
/// stage, a jump that supports a full 32-bit range, for example.
fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self);
}
/// Describes a block terminator (not call) in the vcode, when its branches
@ -205,24 +246,26 @@ pub enum MachTerminator<'a> {
/// A return instruction.
Ret,
/// An unconditional branch to another block.
Uncond(BlockIndex),
Uncond(MachLabel),
/// A conditional branch to one of two other blocks.
Cond(BlockIndex, BlockIndex),
Cond(MachLabel, MachLabel),
/// An indirect branch with known possible targets.
Indirect(&'a [BlockIndex]),
Indirect(&'a [MachLabel]),
}
/// A trait describing the ability to encode a MachInst into binary machine code.
pub trait MachInstEmit<O: MachSectionOutput> {
pub trait MachInstEmit: MachInst {
/// Persistent state carried across `emit` invocations.
type State: Default + Clone + Debug;
/// Emit the instruction.
fn emit(&self, code: &mut O, flags: &Flags);
fn emit(&self, code: &mut MachBuffer<Self>, flags: &Flags, state: &mut Self::State);
}
/// The result of a `MachBackend::compile_function()` call. Contains machine
/// code (as bytes) and a disassembly, if requested.
pub struct MachCompileResult {
/// Machine code.
pub sections: MachSections,
pub buffer: MachBufferFinalized,
/// Size of stack frame, in bytes.
pub frame_size: u32,
/// Disassembly, if requested.
@ -232,7 +275,7 @@ pub struct MachCompileResult {
impl MachCompileResult {
/// Get a `CodeInfo` describing section sizes from this compilation result.
pub fn code_info(&self) -> CodeInfo {
let code_size = self.sections.total_size();
let code_size = self.buffer.total_size();
CodeInfo {
code_size,
jumptables_size: 0,
@ -262,17 +305,13 @@ pub trait MachBackend {
fn name(&self) -> &'static str;
/// Return the register universe for this backend.
fn reg_universe(&self) -> RealRegUniverse;
fn reg_universe(&self) -> &RealRegUniverse;
/// Machine-specific condcode info needed by TargetIsa.
fn unsigned_add_overflow_condition(&self) -> IntCC {
// TODO: this is what x86 specifies. Is this right for arm64?
IntCC::UnsignedLessThan
}
/// Condition that will be true when an IaddIfcout overflows.
fn unsigned_add_overflow_condition(&self) -> IntCC;
/// Machine-specific condcode info needed by TargetIsa.
fn unsigned_sub_overflow_condition(&self) -> IntCC {
// TODO: this is what x86 specifies. Is this right for arm64?
IntCC::UnsignedLessThan
}
/// Condition that will be true when an IsubIfcout overflows.
fn unsigned_sub_overflow_condition(&self) -> IntCC;
}
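
To make the `MachInstLabelUse` contract above more concrete, here is a small, hypothetical stand-in for one label-use kind: a 32-bit PC-relative reference that range-checks the branch distance and patches four little-endian bytes in place. Real backends encode the distance inside an instruction word, so treat this only as a sketch of the range/patch bookkeeping:

// Toy 32-bit PC-relative label use (illustration only, not a real backend's kind).
struct PcRel32;

impl PcRel32 {
    fn max_pos_range(&self) -> u32 { 0x7fff_ffff } // label up to ~2 GiB ahead
    fn max_neg_range(&self) -> u32 { 0x8000_0000 } // label up to 2 GiB behind
    fn patch_size(&self) -> u32 { 4 }

    // Patch the 4 bytes at `use_offset` with the signed distance to `label_offset`.
    fn patch(&self, buffer: &mut [u8], use_offset: u32, label_offset: u32) {
        let distance = label_offset as i64 - use_offset as i64;
        assert!(distance <= self.max_pos_range() as i64);
        assert!(distance >= -(self.max_neg_range() as i64));
        let start = use_offset as usize;
        let end = start + self.patch_size() as usize;
        buffer[start..end].copy_from_slice(&(distance as i32).to_le_bytes());
    }
}

fn main() {
    let mut code = vec![0u8; 16];
    // A reference at offset 4 whose label is later bound at offset 12.
    PcRel32.patch(&mut code, 4, 12);
    assert_eq!(i32::from_le_bytes([code[4], code[5], code[6], code[7]]), 8);
}

A veneer, in these terms, is just a longer-range jump emitted nearby, so that an out-of-range reference can be patched to target the veneer instead of the label itself.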

View file

@ -1,460 +0,0 @@
//! In-memory representation of compiled machine code, in multiple sections
//! (text, constant pool / rodata, etc). Emission occurs into multiple sections
//! simultaneously, so we buffer the result in memory and hand off to the
//! caller at the end of compilation.
use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc};
use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode};
use alloc::vec::Vec;
/// A collection of sections with defined start-offsets.
pub struct MachSections {
/// Sections, in offset order.
pub sections: Vec<MachSection>,
}
impl MachSections {
/// New, empty set of sections.
pub fn new() -> MachSections {
MachSections { sections: vec![] }
}
/// Add a section with a known offset and size. Returns the index.
pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize {
let idx = self.sections.len();
self.sections.push(MachSection::new(start, length));
idx
}
/// Mutably borrow the given section by index.
pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection {
&mut self.sections[idx]
}
/// Get mutable borrows of two sections simultaneously. Used during
/// instruction emission to provide references to the .text and .rodata
/// (constant pool) sections.
pub fn two_sections<'a>(
&'a mut self,
idx1: usize,
idx2: usize,
) -> (&'a mut MachSection, &'a mut MachSection) {
assert!(idx1 < idx2);
assert!(idx1 < self.sections.len());
assert!(idx2 < self.sections.len());
let (first, rest) = self.sections.split_at_mut(idx2);
(&mut first[idx1], &mut rest[0])
}
/// Emit this set of sections to a set of sinks for the code,
/// relocations, traps, and stackmap.
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
// N.B.: we emit every section into the .text section as far as
// the `CodeSink` is concerned; we do not bother to segregate
// the contents into the actual program text, the jumptable and the
// rodata (constant pool). This allows us to generate code assuming
// that these will not be relocated relative to each other, and avoids
// having to designate each section as belonging in one of the three
// fixed categories defined by `CodeSink`. If this becomes a problem
// later (e.g. because of memory permissions or similar), we can
// add this designation and segregate the output; take care, however,
// to add the appropriate relocations in this case.
for section in &self.sections {
if section.data.len() > 0 {
while sink.offset() < section.start_offset {
sink.put1(0);
}
section.emit(sink);
}
}
sink.begin_jumptables();
sink.begin_rodata();
sink.end_codegen();
}
/// Get a list of source location mapping tuples in sorted-by-start-offset order.
pub fn get_srclocs_sorted<'a>(&'a self) -> MachSectionsSrcLocs<'a> {
MachSectionsSrcLocs::new(&self.sections)
}
/// Get the total required size for these sections.
pub fn total_size(&self) -> CodeOffset {
if self.sections.len() == 0 {
0
} else {
// Find the last non-empty section.
self.sections
.iter()
.rev()
.find(|s| s.data.len() > 0)
.map(|s| s.cur_offset_from_start())
.unwrap_or(0)
}
}
}
/// An iterator over the srclocs in each section.
/// Returns MachSrcLocs in an order sorted by start location.
pub struct MachSectionsSrcLocs<'a> {
sections: &'a [MachSection],
cur_section: usize,
cur_srcloc: usize,
// For validation:
last_offset: CodeOffset,
}
impl<'a> MachSectionsSrcLocs<'a> {
fn new(sections: &'a [MachSection]) -> MachSectionsSrcLocs<'a> {
MachSectionsSrcLocs {
sections,
cur_section: 0,
cur_srcloc: 0,
last_offset: 0,
}
}
}
impl<'a> Iterator for MachSectionsSrcLocs<'a> {
type Item = &'a MachSrcLoc;
fn next(&mut self) -> Option<&'a MachSrcLoc> {
// We simply iterate through sections and srcloc records in order. This produces a
// sorted order naturally because sections are in starting-offset-order, and srclocs
// are produced as a section is emitted into, so are in order as well.
// If we're out of sections, we're done.
if self.cur_section >= self.sections.len() {
return None;
}
// Otherwise, make sure we have a srcloc in the current section left to return, and
// advance to the next section if not. Done if we run out of sections.
while self.cur_srcloc >= self.sections[self.cur_section].srclocs.len() {
self.cur_srcloc = 0;
self.cur_section += 1;
if self.cur_section >= self.sections.len() {
return None;
}
}
let loc = &self.sections[self.cur_section].srclocs[self.cur_srcloc];
self.cur_srcloc += 1;
debug_assert!(loc.start >= self.last_offset);
self.last_offset = loc.start;
Some(loc)
}
}
/// An abstraction over MachSection and MachSectionSize: some
/// receiver of section data.
pub trait MachSectionOutput {
/// Get the current offset from the start of all sections.
fn cur_offset_from_start(&self) -> CodeOffset;
/// Get the start offset of this section.
fn start_offset(&self) -> CodeOffset;
/// Add 1 byte to the section.
fn put1(&mut self, _: u8);
/// Add 2 bytes to the section.
fn put2(&mut self, value: u16) {
let [b0, b1] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
}
/// Add 4 bytes to the section.
fn put4(&mut self, value: u32) {
let [b0, b1, b2, b3] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
self.put1(b2);
self.put1(b3);
}
/// Add 8 bytes to the section.
fn put8(&mut self, value: u64) {
let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
self.put1(b2);
self.put1(b3);
self.put1(b4);
self.put1(b5);
self.put1(b6);
self.put1(b7);
}
/// Add a slice of bytes to the section.
fn put_data(&mut self, data: &[u8]);
/// Add a relocation at the current offset.
fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend);
/// Add a trap record at the current offset.
fn add_trap(&mut self, loc: SourceLoc, code: TrapCode);
/// Add a call return address record at the current offset.
fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode);
/// Start the output for the given source-location at the current offset.
fn start_srcloc(&mut self, loc: SourceLoc);
/// End the output for the previously-given source-location at the current offset.
fn end_srcloc(&mut self);
/// Align up to the given alignment.
fn align_to(&mut self, align_to: CodeOffset) {
assert!(align_to.is_power_of_two());
while self.cur_offset_from_start() & (align_to - 1) != 0 {
self.put1(0);
}
}
}
/// A section of output to be emitted to a CodeSink / RelocSink in bulk.
/// Multiple sections may be created with known start offsets in advance; the
/// usual use-case is to create the .text (code) and .rodata (constant pool) at
/// once, after computing the length of the code, so that constant references
/// can use known offsets as instructions are emitted.
pub struct MachSection {
/// The starting offset of this section.
pub start_offset: CodeOffset,
/// The limit of this section, defined by the start of the next section.
pub length_limit: CodeOffset,
/// The section contents, as raw bytes.
pub data: Vec<u8>,
/// Any relocations referring to this section.
pub relocs: Vec<MachReloc>,
/// Any trap records referring to this section.
pub traps: Vec<MachTrap>,
/// Any call site records referring to this section.
pub call_sites: Vec<MachCallSite>,
/// Any source location mappings referring to this section.
pub srclocs: Vec<MachSrcLoc>,
/// The current source location in progress (after `start_srcloc()` and before `end_srcloc()`).
/// This is a (start_offset, src_loc) tuple.
pub cur_srcloc: Option<(CodeOffset, SourceLoc)>,
}
impl MachSection {
/// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`.
pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection {
MachSection {
start_offset,
length_limit,
data: vec![],
relocs: vec![],
traps: vec![],
call_sites: vec![],
srclocs: vec![],
cur_srcloc: None,
}
}
/// Emit this section to the CodeSink and other associated sinks. The
/// current offset of the CodeSink must match the starting offset of this
/// section.
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
assert!(sink.offset() == self.start_offset);
let mut next_reloc = 0;
let mut next_trap = 0;
let mut next_call_site = 0;
for (idx, byte) in self.data.iter().enumerate() {
if next_reloc < self.relocs.len() {
let reloc = &self.relocs[next_reloc];
if reloc.offset == idx as CodeOffset {
sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend);
next_reloc += 1;
}
}
if next_trap < self.traps.len() {
let trap = &self.traps[next_trap];
if trap.offset == idx as CodeOffset {
sink.trap(trap.code, trap.srcloc);
next_trap += 1;
}
}
if next_call_site < self.call_sites.len() {
let call_site = &self.call_sites[next_call_site];
if call_site.ret_addr == idx as CodeOffset {
sink.add_call_site(call_site.opcode, call_site.srcloc);
next_call_site += 1;
}
}
sink.put1(*byte);
}
}
}
impl MachSectionOutput for MachSection {
fn cur_offset_from_start(&self) -> CodeOffset {
self.start_offset + self.data.len() as CodeOffset
}
fn start_offset(&self) -> CodeOffset {
self.start_offset
}
fn put1(&mut self, value: u8) {
assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit);
self.data.push(value);
}
fn put_data(&mut self, data: &[u8]) {
assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit);
self.data.extend_from_slice(data);
}
fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) {
let name = name.clone();
self.relocs.push(MachReloc {
offset: self.data.len() as CodeOffset,
srcloc,
kind,
name,
addend,
});
}
fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) {
self.traps.push(MachTrap {
offset: self.data.len() as CodeOffset,
srcloc,
code,
});
}
fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) {
self.call_sites.push(MachCallSite {
ret_addr: self.data.len() as CodeOffset,
srcloc,
opcode,
});
}
fn start_srcloc(&mut self, loc: SourceLoc) {
self.cur_srcloc = Some((self.cur_offset_from_start(), loc));
}
fn end_srcloc(&mut self) {
let (start, loc) = self
.cur_srcloc
.take()
.expect("end_srcloc() called without start_srcloc()");
let end = self.cur_offset_from_start();
// Skip zero-length extents.
debug_assert!(end >= start);
if end > start {
self.srclocs.push(MachSrcLoc { start, end, loc });
}
}
}
/// A MachSectionOutput implementation that records only size.
pub struct MachSectionSize {
/// The starting offset of this section.
pub start_offset: CodeOffset,
/// The current offset of this section.
pub offset: CodeOffset,
}
impl MachSectionSize {
/// Create a new size-counting dummy section.
pub fn new(start_offset: CodeOffset) -> MachSectionSize {
MachSectionSize {
start_offset,
offset: start_offset,
}
}
/// Return the size this section would take if emitted with a real sink.
pub fn size(&self) -> CodeOffset {
self.offset - self.start_offset
}
}
impl MachSectionOutput for MachSectionSize {
fn cur_offset_from_start(&self) -> CodeOffset {
// All size-counting sections conceptually start at offset 0; this doesn't
// matter when counting code size.
self.offset
}
fn start_offset(&self) -> CodeOffset {
self.start_offset
}
fn put1(&mut self, _: u8) {
self.offset += 1;
}
fn put_data(&mut self, data: &[u8]) {
self.offset += data.len() as CodeOffset;
}
fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {}
fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {}
fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {}
fn start_srcloc(&mut self, _: SourceLoc) {}
fn end_srcloc(&mut self) {}
}
/// A relocation resulting from a compilation.
pub struct MachReloc {
/// The offset at which the relocation applies, *relative to the
/// containing section*.
pub offset: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The kind of relocation.
pub kind: Reloc,
/// The external symbol / name to which this relocation refers.
pub name: ExternalName,
/// The addend to add to the symbol value.
pub addend: i64,
}
/// A trap record resulting from a compilation.
pub struct MachTrap {
/// The offset at which the trap instruction occurs, *relative to the
/// containing section*.
pub offset: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The trap code.
pub code: TrapCode,
}
/// A call site record resulting from a compilation.
pub struct MachCallSite {
/// The offset of the call's return address, *relative to the containing section*.
pub ret_addr: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The call's opcode.
pub opcode: Opcode,
}
/// A source-location mapping resulting from a compilation.
#[derive(Clone, Debug)]
pub struct MachSrcLoc {
/// The start of the region of code corresponding to a source location.
/// This is relative to the start of the function, not to the start of the
/// section.
pub start: CodeOffset,
/// The end of the region of code corresponding to a source location.
/// This is relative to the start of the function, not to the start of the
/// section.
pub end: CodeOffset,
/// The source location.
pub loc: SourceLoc,
}
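
The `start_srcloc`/`end_srcloc` pair above is a simple open/close protocol around the current emission offset, with zero-length regions dropped. A stripped-down, hypothetical version of that bookkeeping (plain `u32`s standing in for `CodeOffset` and `SourceLoc`):

#[derive(Debug, PartialEq)]
struct SrcRange { start: u32, end: u32, loc: u32 }

#[derive(Default)]
struct SrcLocTracker {
    open: Option<(u32, u32)>, // (start offset, loc) of the currently open region
    ranges: Vec<SrcRange>,
}

impl SrcLocTracker {
    fn start(&mut self, offset: u32, loc: u32) {
        assert!(self.open.is_none(), "start() called while a region is open");
        self.open = Some((offset, loc));
    }
    fn end(&mut self, offset: u32) {
        let (start, loc) = self.open.take().expect("end() called without start()");
        if offset > start {
            // Skip zero-length regions, as the section code above does.
            self.ranges.push(SrcRange { start, end: offset, loc });
        }
    }
}

fn main() {
    let mut t = SrcLocTracker::default();
    t.start(0, 100);
    t.end(8); // eight bytes were emitted under source location 100
    t.start(8, 101);
    t.end(8); // nothing emitted: the zero-length region is dropped
    assert_eq!(t.ranges, vec![SrcRange { start: 0, end: 8, loc: 100 }]);
}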

View file

@ -17,9 +17,7 @@
//! See the main module comment in `mod.rs` for more details on the VCode-based
//! backend pipeline.
use crate::entity::SecondaryMap;
use crate::ir;
use crate::ir::SourceLoc;
use crate::ir::{self, SourceLoc};
use crate::machinst::*;
use crate::settings;
@ -30,9 +28,7 @@ use regalloc::{
};
use alloc::boxed::Box;
use alloc::vec::Vec;
use log::debug;
use smallvec::SmallVec;
use alloc::{borrow::Cow, vec::Vec};
use std::fmt;
use std::iter;
use std::string::String;
@ -44,8 +40,8 @@ pub type BlockIndex = u32;
/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be
/// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`.
pub trait VCodeInst: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize> {}
impl<I: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize>> VCodeInst for I {}
pub trait VCodeInst: MachInst + MachInstEmit {}
impl<I: MachInst + MachInstEmit> VCodeInst for I {}
/// A function in "VCode" (virtualized-register code) form, after lowering.
/// This is essentially a standard CFG of basic blocks, where each basic block
@ -79,25 +75,10 @@ pub struct VCode<I: VCodeInst> {
/// Block successor lists, concatenated into one Vec. The `block_succ_range`
/// list of tuples above gives (start, end) ranges within this list that
/// correspond to each basic block's successors.
block_succs: Vec<BlockIndex>,
block_succs: Vec<BlockIx>,
/// Block indices by IR block.
block_by_bb: SecondaryMap<ir::Block, BlockIndex>,
/// IR block for each VCode Block. The length of this Vec will likely be
/// less than the total number of Blocks, because new Blocks (for edge
/// splits, for example) are appended during lowering.
bb_by_block: Vec<ir::Block>,
/// Order of block IDs in final generated code.
final_block_order: Vec<BlockIndex>,
/// Final block offsets. Computed during branch finalization and used
/// during emission.
final_block_offsets: Vec<CodeOffset>,
/// Size of code, accounting for block layout / alignment.
code_size: CodeOffset,
/// Block-order information.
block_order: BlockLoweringOrder,
/// ABI object.
abi: Box<dyn ABIBody<I = I>>,
@ -121,12 +102,8 @@ pub struct VCodeBuilder<I: VCodeInst> {
/// In-progress VCode.
vcode: VCode<I>,
/// Current basic block instructions, in reverse order (because blocks are
/// built bottom-to-top).
bb_insns: SmallVec<[(I, SourceLoc); 32]>,
/// Current IR-inst instructions, in forward order.
ir_inst_insns: SmallVec<[(I, SourceLoc); 4]>,
/// Index of the last block-start in the vcode.
block_start: InsnIndex,
/// Start of succs for the current block in the concatenated succs list.
succ_start: usize,
@ -137,12 +114,11 @@ pub struct VCodeBuilder<I: VCodeInst> {
impl<I: VCodeInst> VCodeBuilder<I> {
/// Create a new VCodeBuilder.
pub fn new(abi: Box<dyn ABIBody<I = I>>) -> VCodeBuilder<I> {
let vcode = VCode::new(abi);
pub fn new(abi: Box<dyn ABIBody<I = I>>, block_order: BlockLoweringOrder) -> VCodeBuilder<I> {
let vcode = VCode::new(abi, block_order);
VCodeBuilder {
vcode,
bb_insns: SmallVec::new(),
ir_inst_insns: SmallVec::new(),
block_start: 0,
succ_start: 0,
cur_srcloc: SourceLoc::default(),
}
@ -153,6 +129,11 @@ impl<I: VCodeInst> VCodeBuilder<I> {
&mut *self.vcode.abi
}
/// Access to the BlockLoweringOrder object.
pub fn block_order(&self) -> &BlockLoweringOrder {
&self.vcode.block_order
}
/// Set the type of a VReg.
pub fn set_vreg_type(&mut self, vreg: VirtualReg, ty: Type) {
while self.vcode.vreg_types.len() <= vreg.get_index() {
@ -161,53 +142,17 @@ impl<I: VCodeInst> VCodeBuilder<I> {
self.vcode.vreg_types[vreg.get_index()] = ty;
}
/// Return the underlying bb-to-BlockIndex map.
pub fn blocks_by_bb(&self) -> &SecondaryMap<ir::Block, BlockIndex> {
&self.vcode.block_by_bb
}
/// Initialize the bb-to-BlockIndex map. Returns the first free
/// BlockIndex.
pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex {
let mut bindex: BlockIndex = 0;
for bb in blocks.iter() {
self.vcode.block_by_bb[*bb] = bindex;
self.vcode.bb_by_block.push(*bb);
bindex += 1;
}
bindex
}
/// Get the BlockIndex for an IR block.
pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex {
self.vcode.block_by_bb[bb]
}
/// Set the current block as the entry block.
pub fn set_entry(&mut self, block: BlockIndex) {
self.vcode.entry = block;
}
/// End the current IR instruction. Must be called after pushing any
/// instructions and prior to ending the basic block.
pub fn end_ir_inst(&mut self) {
while let Some(pair) = self.ir_inst_insns.pop() {
self.bb_insns.push(pair);
}
}
/// End the current basic block. Must be called after emitting vcode insts
/// for IR insts and prior to ending the function (building the VCode).
pub fn end_bb(&mut self) -> BlockIndex {
assert!(self.ir_inst_insns.is_empty());
let block_num = self.vcode.block_ranges.len() as BlockIndex;
// Push the instructions.
let start_idx = self.vcode.insts.len() as InsnIndex;
while let Some((i, loc)) = self.bb_insns.pop() {
self.vcode.insts.push(i);
self.vcode.srclocs.push(loc);
}
pub fn end_bb(&mut self) {
let start_idx = self.block_start;
let end_idx = self.vcode.insts.len() as InsnIndex;
self.block_start = end_idx;
// Add the instruction index range to the list of blocks.
self.vcode.block_ranges.push((start_idx, end_idx));
// End the successors list.
@ -216,8 +161,6 @@ impl<I: VCodeInst> VCodeBuilder<I> {
.block_succ_range
.push((self.succ_start, succ_end));
self.succ_start = succ_end;
block_num
}
/// Push an instruction for the current BB and current IR inst within the BB.
@ -225,19 +168,27 @@ impl<I: VCodeInst> VCodeBuilder<I> {
match insn.is_term() {
MachTerminator::None | MachTerminator::Ret => {}
MachTerminator::Uncond(target) => {
self.vcode.block_succs.push(target);
self.vcode.block_succs.push(BlockIx::new(target.get()));
}
MachTerminator::Cond(true_branch, false_branch) => {
self.vcode.block_succs.push(true_branch);
self.vcode.block_succs.push(false_branch);
self.vcode.block_succs.push(BlockIx::new(true_branch.get()));
self.vcode
.block_succs
.push(BlockIx::new(false_branch.get()));
}
MachTerminator::Indirect(targets) => {
for target in targets {
self.vcode.block_succs.push(*target);
self.vcode.block_succs.push(BlockIx::new(target.get()));
}
}
}
self.ir_inst_insns.push((insn, self.cur_srcloc));
self.vcode.insts.push(insn);
self.vcode.srclocs.push(self.cur_srcloc);
}
/// Get the current source location.
pub fn get_srcloc(&self) -> SourceLoc {
self.cur_srcloc
}
/// Set the current source location.
@ -247,8 +198,6 @@ impl<I: VCodeInst> VCodeBuilder<I> {
/// Build the final VCode.
pub fn build(self) -> VCode<I> {
assert!(self.ir_inst_insns.is_empty());
assert!(self.bb_insns.is_empty());
self.vcode
}
}
@ -270,35 +219,9 @@ fn is_redundant_move<I: VCodeInst>(insn: &I) -> bool {
}
}
fn is_trivial_jump_block<I: VCodeInst>(vcode: &VCode<I>, block: BlockIndex) -> Option<BlockIndex> {
let range = vcode.block_insns(BlockIx::new(block));
debug!(
"is_trivial_jump_block: block {} has len {}",
block,
range.len()
);
if range.len() != 1 {
return None;
}
let insn = range.first();
debug!(
" -> only insn is: {:?} with terminator {:?}",
vcode.get_insn(insn),
vcode.get_insn(insn).is_term()
);
match vcode.get_insn(insn).is_term() {
MachTerminator::Uncond(target) => Some(target),
_ => None,
}
}
impl<I: VCodeInst> VCode<I> {
/// New empty VCode.
fn new(abi: Box<dyn ABIBody<I = I>>) -> VCode<I> {
fn new(abi: Box<dyn ABIBody<I = I>>, block_order: BlockLoweringOrder) -> VCode<I> {
VCode {
liveins: abi.liveins(),
liveouts: abi.liveouts(),
@ -309,11 +232,7 @@ impl<I: VCodeInst> VCode<I> {
block_ranges: vec![],
block_succ_range: vec![],
block_succs: vec![],
block_by_bb: SecondaryMap::with_default(0),
bb_by_block: vec![],
final_block_order: vec![],
final_block_offsets: vec![],
code_size: 0,
block_order,
abi,
}
}
@ -345,7 +264,7 @@ impl<I: VCodeInst> VCode<I> {
}
/// Get the successors for a block.
pub fn succs(&self, block: BlockIndex) -> &[BlockIndex] {
pub fn succs(&self, block: BlockIndex) -> &[BlockIx] {
let (start, end) = self.block_succ_range[block as usize];
&self.block_succs[start..end]
}
@ -354,8 +273,6 @@ impl<I: VCodeInst> VCode<I> {
/// instructions including spliced fill/reload/move instructions, and replace
/// the VCode with them.
pub fn replace_insns_from_regalloc(&mut self, result: RegAllocResult<Self>) {
self.final_block_order = compute_final_block_order(self);
// Record the spillslot count and clobbered registers for the ABI/stack
// setup code.
self.abi.set_num_spillslots(result.num_spill_slots as usize);
@ -370,11 +287,12 @@ impl<I: VCodeInst> VCode<I> {
let mut final_block_ranges = vec![(0, 0); self.num_blocks()];
let mut final_srclocs = vec![];
for block in &self.final_block_order {
let (start, end) = block_ranges[*block as usize];
for block in 0..self.num_blocks() {
let block = block as BlockIndex;
let (start, end) = block_ranges[block as usize];
let final_start = final_insns.len() as InsnIndex;
if *block == self.entry {
if block == self.entry {
// Start with the prologue.
let prologue = self.abi.gen_prologue();
let len = prologue.len();
@ -416,7 +334,7 @@ impl<I: VCodeInst> VCode<I> {
}
let final_end = final_insns.len() as InsnIndex;
final_block_ranges[*block as usize] = (final_start, final_end);
final_block_ranges[block as usize] = (final_start, final_end);
}
debug_assert!(final_insns.len() == final_srclocs.len());
@ -426,174 +344,68 @@ impl<I: VCodeInst> VCode<I> {
self.block_ranges = final_block_ranges;
}
/// Removes redundant branches, rewriting targets to point directly to the
/// ultimate block at the end of a chain of trivial one-target jumps.
pub fn remove_redundant_branches(&mut self) {
// For each block, compute the actual target block, looking through up to one
// block with single-target jumps (this will remove empty edge blocks inserted
// by phi-lowering).
let block_rewrites: Vec<BlockIndex> = (0..self.num_blocks() as u32)
.map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix))
.collect();
let mut refcounts: Vec<usize> = vec![0; self.num_blocks()];
debug!(
"remove_redundant_branches: block_rewrites = {:?}",
block_rewrites
);
refcounts[self.entry as usize] = 1;
for block in 0..self.num_blocks() as u32 {
for insn in self.block_insns(BlockIx::new(block)) {
self.get_insn_mut(insn)
.with_block_rewrites(&block_rewrites[..]);
match self.get_insn(insn).is_term() {
MachTerminator::Uncond(bix) => {
refcounts[bix as usize] += 1;
}
MachTerminator::Cond(bix1, bix2) => {
refcounts[bix1 as usize] += 1;
refcounts[bix2 as usize] += 1;
}
MachTerminator::Indirect(blocks) => {
for block in blocks {
refcounts[*block as usize] += 1;
}
}
_ => {}
}
}
}
let deleted: Vec<bool> = refcounts.iter().map(|r| *r == 0).collect();
let block_order = std::mem::replace(&mut self.final_block_order, vec![]);
self.final_block_order = block_order
.into_iter()
.filter(|b| !deleted[*b as usize])
.collect();
// Rewrite successor information based on the block-rewrite map.
for succ in &mut self.block_succs {
let new_succ = block_rewrites[*succ as usize];
*succ = new_succ;
}
}
/// Mutate branch instructions to (i) lower two-way condbrs to one-way,
/// depending on fallthrough; and (ii) use concrete offsets.
pub fn finalize_branches(&mut self)
/// Emit the instructions to a `MachBuffer`, containing fixed-up code and external
/// reloc/trap/etc. records ready for use.
pub fn emit(&self) -> MachBuffer<I>
where
I: MachInstEmit<MachSectionSize>,
I: MachInstEmit,
{
// Compute fallthrough block, indexed by block.
let num_final_blocks = self.final_block_order.len();
let mut block_fallthrough: Vec<Option<BlockIndex>> = vec![None; self.num_blocks()];
for i in 0..(num_final_blocks - 1) {
let from = self.final_block_order[i];
let to = self.final_block_order[i + 1];
block_fallthrough[from as usize] = Some(to);
}
let mut buffer = MachBuffer::new();
let mut state = Default::default();
// Pass over VCode instructions and finalize two-way branches into
// one-way branches with fallthrough.
buffer.reserve_labels_for_blocks(self.num_blocks() as BlockIndex); // first N MachLabels are simply block indices.
let flags = self.abi.flags();
let mut cur_srcloc = None;
for block in 0..self.num_blocks() {
let next_block = block_fallthrough[block];
let (start, end) = self.block_ranges[block];
for iix in start..end {
let insn = &mut self.insts[iix as usize];
insn.with_fallthrough_block(next_block);
}
}
let flags = self.abi.flags();
// Compute block offsets.
let mut code_section = MachSectionSize::new(0);
let mut block_offsets = vec![0; self.num_blocks()];
for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset);
block_offsets[block as usize] = code_section.offset;
let (start, end) = self.block_ranges[block as usize];
for iix in start..end {
self.insts[iix as usize].emit(&mut code_section, flags);
}
}
// We now have the section layout.
self.final_block_offsets = block_offsets;
self.code_size = code_section.size();
// Update branches with known block offsets. This looks like the
// traversal above, but (i) does not update block_offsets, rather uses
// it (so forward references are now possible), and (ii) mutates the
// instructions.
let mut code_section = MachSectionSize::new(0);
for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset);
let (start, end) = self.block_ranges[block as usize];
for iix in start..end {
self.insts[iix as usize]
.with_block_offsets(code_section.offset, &self.final_block_offsets[..]);
self.insts[iix as usize].emit(&mut code_section, flags);
}
}
}
/// Emit the instructions to a list of sections.
pub fn emit(&self) -> MachSections
where
I: MachInstEmit<MachSection>,
{
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, self.code_size);
let code_section = sections.get_section(code_idx);
let flags = self.abi.flags();
let mut cur_srcloc = SourceLoc::default();
for &block in &self.final_block_order {
let new_offset = I::align_basic_block(code_section.cur_offset_from_start());
while new_offset > code_section.cur_offset_from_start() {
let block = block as BlockIndex;
let new_offset = I::align_basic_block(buffer.cur_offset());
while new_offset > buffer.cur_offset() {
// Pad with NOPs up to the aligned block offset.
let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize);
nop.emit(code_section, flags);
let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize);
nop.emit(&mut buffer, flags, &mut Default::default());
}
assert_eq!(code_section.cur_offset_from_start(), new_offset);
assert_eq!(buffer.cur_offset(), new_offset);
let (start, end) = self.block_ranges[block as usize];
buffer.bind_label(MachLabel::from_block(block));
for iix in start..end {
let srcloc = self.srclocs[iix as usize];
if srcloc != cur_srcloc {
if !cur_srcloc.is_default() {
code_section.end_srcloc();
if cur_srcloc != Some(srcloc) {
if cur_srcloc.is_some() {
buffer.end_srcloc();
}
if !srcloc.is_default() {
code_section.start_srcloc(srcloc);
}
cur_srcloc = srcloc;
buffer.start_srcloc(srcloc);
cur_srcloc = Some(srcloc);
}
self.insts[iix as usize].emit(code_section, flags);
self.insts[iix as usize].emit(&mut buffer, flags, &mut state);
}
if !cur_srcloc.is_default() {
code_section.end_srcloc();
cur_srcloc = SourceLoc::default();
if cur_srcloc.is_some() {
buffer.end_srcloc();
cur_srcloc = None;
}
// Do we need an island? Get the worst-case size of the next BB and see if, having
// emitted that many bytes, we will be beyond the deadline.
if block < (self.num_blocks() - 1) as BlockIndex {
let next_block = block + 1;
let next_block_range = self.block_ranges[next_block as usize];
let next_block_size = next_block_range.1 - next_block_range.0;
let worst_case_next_bb = I::worst_case_size() * next_block_size;
if buffer.island_needed(worst_case_next_bb) {
buffer.emit_island();
}
}
}
sections
buffer
}
/// Get the IR block for a BlockIndex, if one exists.
pub fn bindex_to_bb(&self, block: BlockIndex) -> Option<ir::Block> {
if (block as usize) < self.bb_by_block.len() {
Some(self.bb_by_block[block as usize])
} else {
None
}
self.block_order.lowered_order()[block as usize].orig_block()
}
}
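
The island decision at the end of `emit()` above is a worst-case bound: assume every instruction of the next block reaches `I::worst_case_size()`, and emit an island now if that many extra bytes could push a pending label fixup out of range. A hypothetical standalone version of that check (toy numbers, not the actual `MachBuffer` bookkeeping):

// Decide whether to emit a veneer island before lowering the next block.
// `nearest_deadline` is the furthest offset at which the closest pending
// fixup can still reach its label without a veneer.
fn island_needed(
    cur_offset: u32,
    next_block_insn_count: u32,
    worst_case_insn_size: u32,
    nearest_deadline: u32,
) -> bool {
    let worst_case_next_block = next_block_insn_count * worst_case_insn_size;
    cur_offset + worst_case_next_block > nearest_deadline
}

fn main() {
    // E.g. a conditional branch with roughly +/-1 MiB of reach, placed at offset 0.
    let deadline = 1 << 20;
    assert!(!island_needed(4_096, 100, 4, deadline)); // plenty of slack left
    assert!(island_needed(1_048_000, 200, 4, deadline)); // could overrun: emit island
}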
@ -629,13 +441,9 @@ impl<I: VCodeInst> RegallocFunction for VCode<I> {
Range::new(InstIx::new(start), (end - start) as usize)
}
fn block_succs(&self, block: BlockIx) -> Vec<BlockIx> {
fn block_succs(&self, block: BlockIx) -> Cow<[BlockIx]> {
let (start, end) = self.block_succ_range[block.get() as usize];
self.block_succs[start..end]
.iter()
.cloned()
.map(BlockIx::new)
.collect()
Cow::Borrowed(&self.block_succs[start..end])
}
fn is_ret(&self, insn: InstIx) -> bool {
@ -649,7 +457,7 @@ impl<I: VCodeInst> RegallocFunction for VCode<I> {
insn.get_regs(collector)
}
fn map_regs(insn: &mut I, mapper: &RegUsageMapper) {
fn map_regs<RUM: RegUsageMapper>(insn: &mut I, mapper: &RUM) {
insn.map_regs(mapper);
}
@ -702,12 +510,11 @@ impl<I: VCodeInst> fmt::Debug for VCode<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "VCode_Debug {{")?;
writeln!(f, " Entry block: {}", self.entry)?;
writeln!(f, " Final block order: {:?}", self.final_block_order)?;
for block in 0..self.num_blocks() {
writeln!(f, "Block {}:", block,)?;
for succ in self.succs(block as BlockIndex) {
writeln!(f, " (successor: Block {})", succ)?;
writeln!(f, " (successor: Block {})", succ.get())?;
}
let (start, end) = self.block_ranges[block];
writeln!(f, " (instruction range: {} .. {})", start, end)?;
@ -726,52 +533,21 @@ impl<I: VCodeInst + ShowWithRRU> ShowWithRRU for VCode<I> {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
use std::fmt::Write;
// Calculate an order in which to display the blocks. This is the same
// as final_block_order, but also includes blocks which are in the
// representation but not in final_block_order.
let mut display_order = Vec::<usize>::new();
// First display blocks in `final_block_order`
for bix in &self.final_block_order {
assert!((*bix as usize) < self.num_blocks());
display_order.push(*bix as usize);
}
// Now also take care of those not listed in `final_block_order`.
// This is quadratic, but it's also debug-only code.
for bix in 0..self.num_blocks() {
if display_order.contains(&bix) {
continue;
}
display_order.push(bix);
}
let mut s = String::new();
write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap();
write!(&mut s, " Entry block: {}\n", self.entry).unwrap();
write!(
&mut s,
" Final block order: {:?}\n",
self.final_block_order
)
.unwrap();
for i in 0..self.num_blocks() {
let block = display_order[i];
let block = i as BlockIndex;
let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len()
{
"** OMITTED **"
} else {
""
};
write!(&mut s, "Block {}: {}\n", block, omitted).unwrap();
if let Some(bb) = self.bindex_to_bb(block as BlockIndex) {
write!(&mut s, "Block {}:\n", block).unwrap();
if let Some(bb) = self.bindex_to_bb(block) {
write!(&mut s, " (original IR block: {})\n", bb).unwrap();
}
for succ in self.succs(block as BlockIndex) {
write!(&mut s, " (successor: Block {})\n", succ).unwrap();
for succ in self.succs(block) {
write!(&mut s, " (successor: Block {})\n", succ.get()).unwrap();
}
let (start, end) = self.block_ranges[block];
let (start, end) = self.block_ranges[block as usize];
write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap();
for inst in start..end {
write!(

View file

@ -1,52 +0,0 @@
//! A pass that computes the number of uses of any given instruction.
use crate::entity::SecondaryMap;
use crate::ir::dfg::ValueDef;
use crate::ir::Value;
use crate::ir::{DataFlowGraph, Function, Inst};
/// Auxiliary data structure that counts the number of uses of any given
/// instruction in a Function. This is used during instruction selection
/// to essentially do incremental DCE: when an instruction is no longer
/// needed because its computation has been isel'd into another machine
/// instruction at every use site, we can skip it.
#[derive(Clone, Debug)]
pub struct NumUses {
uses: SecondaryMap<Inst, u32>,
}
impl NumUses {
fn new() -> NumUses {
NumUses {
uses: SecondaryMap::with_default(0),
}
}
/// Compute the NumUses analysis result for a function.
pub fn compute(func: &Function) -> NumUses {
let mut uses = NumUses::new();
for bb in func.layout.blocks() {
for inst in func.layout.block_insts(bb) {
for arg in func.dfg.inst_args(inst) {
let v = func.dfg.resolve_aliases(*arg);
uses.add_value(&func.dfg, v);
}
}
}
uses
}
fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) {
match dfg.value_def(v) {
ValueDef::Result(inst, _) => {
self.uses[inst] += 1;
}
_ => {}
}
}
/// Take the complete uses map, consuming this analysis result.
pub fn take_uses(self) -> SecondaryMap<Inst, u32> {
self.uses
}
}
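
The deleted `NumUses` pass above boils down to a per-definition reference count over instruction arguments. A toy equivalent over a flat IR, where `args[i]` lists the defining instructions consumed by instruction `i` (hypothetical shape, not the Cranelift `DataFlowGraph`):

// Count how many times each instruction's result is used as an argument.
fn count_uses(args: &[Vec<usize>]) -> Vec<u32> {
    let mut uses = vec![0u32; args.len()];
    for insn_args in args {
        for &def in insn_args {
            uses[def] += 1;
        }
    }
    uses
}

fn main() {
    // 0: iconst, 1: iconst, 2: iadd(v0, v1), 3: imul(v2, v0)
    let args = vec![vec![], vec![], vec![0, 1], vec![2, 0]];
    assert_eq!(count_uses(&args), vec![2, 1, 1, 0]);
    // A count of zero is what lets instruction selection skip a value whose
    // computation was folded into all of its users.
}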

887
third_party/rust/cranelift-codegen/src/peepmatic.rs vendored Normal file
View file

@ -0,0 +1,887 @@
//! Glue for working with `peepmatic`-generated peephole optimizers.
use crate::cursor::{Cursor, FuncCursor};
use crate::ir::{
dfg::DataFlowGraph,
entities::{Inst, Value},
immediates::{Imm64, Uimm64},
instructions::{InstructionData, Opcode},
types, InstBuilder,
};
use crate::isa::TargetIsa;
use cranelift_codegen_shared::condcodes::IntCC;
use peepmatic_runtime::{
cc::ConditionCode,
instruction_set::InstructionSet,
operator::Operator,
part::{Constant, Part},
paths::Path,
r#type::{BitWidth, Kind, Type},
PeepholeOptimizations, PeepholeOptimizer,
};
use std::boxed::Box;
use std::convert::{TryFrom, TryInto};
use std::ptr;
use std::sync::atomic::{AtomicPtr, Ordering};
/// Get the `preopt.peepmatic` peephole optimizer.
pub(crate) fn preopt<'a, 'b>(
isa: &'b dyn TargetIsa,
) -> PeepholeOptimizer<'static, 'a, &'b dyn TargetIsa> {
static SERIALIZED: &[u8] = include_bytes!("preopt.serialized");
// Once initialized, this must never be re-assigned. The initialized value
// is semantically "static data" and is intentionally leaked for the whole
// program's lifetime.
static DESERIALIZED: AtomicPtr<PeepholeOptimizations> = AtomicPtr::new(ptr::null_mut());
// If `DESERIALIZED` has already been initialized, then just use it.
let ptr = DESERIALIZED.load(Ordering::SeqCst);
if let Some(peep_opts) = unsafe { ptr.as_ref() } {
return peep_opts.optimizer(isa);
}
// Otherwise, if `DESERIALIZED` hasn't been initialized, then we need to
// deserialize the peephole optimizations and initialize it. However,
// another thread could be doing the same thing concurrently, so there is a
// race to see who initializes `DESERIALIZED` first, and we need to be
// prepared to both win or lose that race.
let peep_opts = PeepholeOptimizations::deserialize(SERIALIZED)
.expect("should always be able to deserialize `preopt.serialized`");
let peep_opts = Box::into_raw(Box::new(peep_opts));
// Only update `DESERIALIZED` if it is still null, attempting to perform the
// one-time transition from null -> non-null.
if DESERIALIZED
.compare_and_swap(ptr::null_mut(), peep_opts, Ordering::SeqCst)
.is_null()
{
// We won the race to initialize `DESERIALIZED`.
debug_assert_eq!(DESERIALIZED.load(Ordering::SeqCst), peep_opts);
let peep_opts = unsafe { &*peep_opts };
return peep_opts.optimizer(isa);
}
// We lost the race to initialize `DESERIALIZED`. Drop our no-longer-needed
// instance of `peep_opts` and get the pointer to the instance that won the
// race.
let _ = unsafe { Box::from_raw(peep_opts) };
let peep_opts = DESERIALIZED.load(Ordering::SeqCst);
let peep_opts = unsafe { peep_opts.as_ref().unwrap() };
peep_opts.optimizer(isa)
}
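
The `DESERIALIZED` handling above is the usual race-to-initialize pattern for a leaked static: build a candidate, try to swap it in from null, and if another thread won, free the candidate and use the winner's value. A minimal, hypothetical sketch of the same pattern with a plain `String` payload (using `compare_exchange`, which newer Rust prefers over the deprecated `compare_and_swap`):

use std::ptr;
use std::sync::atomic::{AtomicPtr, Ordering};

static CACHED: AtomicPtr<String> = AtomicPtr::new(ptr::null_mut());

// Return a lazily-initialized, intentionally leaked value.
fn get_cached() -> &'static String {
    let p = CACHED.load(Ordering::SeqCst);
    if let Some(v) = unsafe { p.as_ref() } {
        return v; // already initialized by us or by another thread
    }
    // Build a candidate; other threads may be doing the same concurrently.
    let candidate = Box::into_raw(Box::new(String::from("expensive-to-build value")));
    match CACHED.compare_exchange(ptr::null_mut(), candidate, Ordering::SeqCst, Ordering::SeqCst) {
        // We won the race: the candidate is now the shared, leaked value.
        Ok(_) => unsafe { &*candidate },
        // We lost: drop our candidate and use the winner's pointer.
        Err(winner) => {
            drop(unsafe { Box::from_raw(candidate) });
            unsafe { &*winner }
        }
    }
}

fn main() {
    assert!(std::ptr::eq(get_cached(), get_cached())); // same leaked instance
}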
/// Either a `Value` or an `Inst`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ValueOrInst {
Value(Value),
Inst(Inst),
}
impl ValueOrInst {
/// Get the underlying `Value` if any.
pub fn value(&self) -> Option<Value> {
match *self {
Self::Value(v) => Some(v),
Self::Inst(_) => None,
}
}
/// Get the underlying `Inst` if any.
pub fn inst(&self) -> Option<Inst> {
match *self {
Self::Inst(i) => Some(i),
Self::Value(_) => None,
}
}
/// Unwrap the underlying `Value`, panicking if it is not a `Value`.
pub fn unwrap_value(&self) -> Value {
self.value().unwrap()
}
/// Unwrap the underlying `Inst`, panicking if it is not an `Inst`.
pub fn unwrap_inst(&self) -> Inst {
self.inst().unwrap()
}
/// Is this a `Value`?
pub fn is_value(&self) -> bool {
self.value().is_some()
}
/// Is this an `Inst`?
pub fn is_inst(&self) -> bool {
self.inst().is_some()
}
fn resolve_inst(&self, dfg: &DataFlowGraph) -> Option<Inst> {
match *self {
ValueOrInst::Inst(i) => Some(i),
ValueOrInst::Value(v) => dfg.value_def(v).inst(),
}
}
fn result_bit_width(&self, dfg: &DataFlowGraph) -> u8 {
match *self {
ValueOrInst::Value(v) => dfg.value_type(v).bits().try_into().unwrap(),
ValueOrInst::Inst(inst) => {
let result = dfg.first_result(inst);
dfg.value_type(result).bits().try_into().unwrap()
}
}
}
fn to_constant(&self, pos: &mut FuncCursor) -> Option<Constant> {
let inst = self.resolve_inst(&pos.func.dfg)?;
match pos.func.dfg[inst] {
InstructionData::UnaryImm {
opcode: Opcode::Iconst,
imm,
} => {
let width = self.result_bit_width(&pos.func.dfg).try_into().unwrap();
let x: i64 = imm.into();
Some(Constant::Int(x as u64, width))
}
InstructionData::UnaryBool {
opcode: Opcode::Bconst,
imm,
} => {
let width = self.result_bit_width(&pos.func.dfg).try_into().unwrap();
Some(Constant::Bool(imm, width))
}
_ => None,
}
}
}
impl From<Value> for ValueOrInst {
fn from(v: Value) -> ValueOrInst {
ValueOrInst::Value(v)
}
}
impl From<Inst> for ValueOrInst {
fn from(i: Inst) -> ValueOrInst {
ValueOrInst::Inst(i)
}
}
/// Get the fixed bit width of `bit_width`, or if it is polymorphic, the bit
/// width of `root`.
fn bit_width(dfg: &DataFlowGraph, bit_width: BitWidth, root: Inst) -> u8 {
bit_width.fixed_width().unwrap_or_else(|| {
let tyvar = dfg.ctrl_typevar(root);
let ty = dfg.compute_result_type(root, 0, tyvar).unwrap();
u8::try_from(ty.bits()).unwrap()
})
}
/// Convert the constant `c` into an instruction.
fn const_to_value<'a>(builder: impl InstBuilder<'a>, c: Constant, root: Inst) -> Value {
match c {
Constant::Bool(b, width) => {
let width = bit_width(builder.data_flow_graph(), width, root);
let ty = match width {
1 => types::B1,
8 => types::B8,
16 => types::B16,
32 => types::B32,
64 => types::B64,
128 => types::B128,
_ => unreachable!(),
};
builder.bconst(ty, b)
}
Constant::Int(x, width) => {
let width = bit_width(builder.data_flow_graph(), width, root);
let ty = match width {
8 => types::I8,
16 => types::I16,
32 => types::I32,
64 => types::I64,
128 => types::I128,
_ => unreachable!(),
};
builder.iconst(ty, x as i64)
}
}
}
fn part_to_value(pos: &mut FuncCursor, root: Inst, part: Part<ValueOrInst>) -> Option<Value> {
match part {
Part::Instruction(ValueOrInst::Inst(inst)) => {
pos.func.dfg.inst_results(inst).first().copied()
}
Part::Instruction(ValueOrInst::Value(v)) => Some(v),
Part::Constant(c) => Some(const_to_value(pos.ins(), c, root)),
Part::ConditionCode(_) => None,
}
}
impl Opcode {
fn to_peepmatic_operator(&self) -> Option<Operator> {
macro_rules! convert {
( $( $op:ident $(,)* )* ) => {
match self {
$( Self::$op => Some(Operator::$op), )*
_ => None,
}
}
}
convert!(
AdjustSpDown,
AdjustSpDownImm,
Band,
BandImm,
Bconst,
Bint,
Bor,
BorImm,
Brnz,
Brz,
Bxor,
BxorImm,
Iadd,
IaddImm,
Icmp,
IcmpImm,
Iconst,
Ifcmp,
IfcmpImm,
Imul,
ImulImm,
Ireduce,
IrsubImm,
Ishl,
IshlImm,
Isub,
Rotl,
RotlImm,
Rotr,
RotrImm,
Sdiv,
SdivImm,
Select,
Sextend,
Srem,
SremImm,
Sshr,
SshrImm,
Trapnz,
Trapz,
Udiv,
UdivImm,
Uextend,
Urem,
UremImm,
Ushr,
UshrImm,
)
}
}
impl TryFrom<Constant> for Imm64 {
type Error = &'static str;
fn try_from(c: Constant) -> Result<Self, Self::Error> {
match c {
Constant::Int(x, _) => Ok(Imm64::from(x as i64)),
Constant::Bool(..) => Err("cannot create Imm64 from Constant::Bool"),
}
}
}
impl Into<Constant> for Imm64 {
#[inline]
fn into(self) -> Constant {
let x: i64 = self.into();
Constant::Int(x as _, BitWidth::SixtyFour)
}
}
impl Into<Part<ValueOrInst>> for Imm64 {
#[inline]
fn into(self) -> Part<ValueOrInst> {
let c: Constant = self.into();
c.into()
}
}
fn part_to_imm64(pos: &mut FuncCursor, part: Part<ValueOrInst>) -> Imm64 {
return match part {
Part::Instruction(x) => match x.to_constant(pos).unwrap_or_else(|| cannot_convert()) {
Constant::Int(x, _) => (x as i64).into(),
Constant::Bool(..) => cannot_convert(),
},
Part::Constant(Constant::Int(x, _)) => (x as i64).into(),
Part::ConditionCode(_) | Part::Constant(Constant::Bool(..)) => cannot_convert(),
};
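// Keeping the panic in a separate `#[cold]`, never-inlined helper keeps the
// common conversion path above compact; `unsupported` in `get_immediate`
// below follows the same pattern.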
#[inline(never)]
#[cold]
fn cannot_convert() -> ! {
panic!("cannot convert part into `Imm64`")
}
}
impl Into<Constant> for Uimm64 {
#[inline]
fn into(self) -> Constant {
let x: u64 = self.into();
Constant::Int(x, BitWidth::SixtyFour)
}
}
impl Into<Part<ValueOrInst>> for Uimm64 {
#[inline]
fn into(self) -> Part<ValueOrInst> {
let c: Constant = self.into();
c.into()
}
}
fn peepmatic_to_intcc(cc: ConditionCode) -> IntCC {
match cc {
ConditionCode::Eq => IntCC::Equal,
ConditionCode::Ne => IntCC::NotEqual,
ConditionCode::Slt => IntCC::SignedLessThan,
ConditionCode::Sle => IntCC::SignedGreaterThanOrEqual,
ConditionCode::Sgt => IntCC::SignedGreaterThan,
ConditionCode::Sge => IntCC::SignedLessThanOrEqual,
ConditionCode::Ult => IntCC::UnsignedLessThan,
ConditionCode::Uge => IntCC::UnsignedGreaterThanOrEqual,
ConditionCode::Ugt => IntCC::UnsignedGreaterThan,
ConditionCode::Ule => IntCC::UnsignedLessThanOrEqual,
ConditionCode::Of => IntCC::Overflow,
ConditionCode::Nof => IntCC::NotOverflow,
}
}
fn intcc_to_peepmatic(cc: IntCC) -> ConditionCode {
match cc {
IntCC::Equal => ConditionCode::Eq,
IntCC::NotEqual => ConditionCode::Ne,
IntCC::SignedLessThan => ConditionCode::Slt,
IntCC::SignedGreaterThanOrEqual => ConditionCode::Sle,
IntCC::SignedGreaterThan => ConditionCode::Sgt,
IntCC::SignedLessThanOrEqual => ConditionCode::Sge,
IntCC::UnsignedLessThan => ConditionCode::Ult,
IntCC::UnsignedGreaterThanOrEqual => ConditionCode::Uge,
IntCC::UnsignedGreaterThan => ConditionCode::Ugt,
IntCC::UnsignedLessThanOrEqual => ConditionCode::Ule,
IntCC::Overflow => ConditionCode::Of,
IntCC::NotOverflow => ConditionCode::Nof,
}
}
fn get_immediate(dfg: &DataFlowGraph, inst: Inst, i: usize) -> Part<ValueOrInst> {
return match dfg[inst] {
InstructionData::BinaryImm64 { imm, .. } if i == 0 => imm.into(),
InstructionData::BranchIcmp { cond, .. } if i == 0 => intcc_to_peepmatic(cond).into(),
InstructionData::BranchInt { cond, .. } if i == 0 => intcc_to_peepmatic(cond).into(),
InstructionData::IntCompare { cond, .. } if i == 0 => intcc_to_peepmatic(cond).into(),
InstructionData::IntCompareImm { cond, .. } if i == 0 => intcc_to_peepmatic(cond).into(),
InstructionData::IntCompareImm { imm, .. } if i == 1 => imm.into(),
InstructionData::IntCond { cond, .. } if i == 0 => intcc_to_peepmatic(cond).into(),
InstructionData::IntCondTrap { cond, .. } if i == 0 => intcc_to_peepmatic(cond).into(),
InstructionData::IntSelect { cond, .. } if i == 0 => intcc_to_peepmatic(cond).into(),
InstructionData::UnaryBool { imm, .. } if i == 0 => {
Constant::Bool(imm, BitWidth::Polymorphic).into()
}
InstructionData::UnaryImm { imm, .. } if i == 0 => imm.into(),
ref otherwise => unsupported(otherwise),
};
#[inline(never)]
#[cold]
fn unsupported(data: &InstructionData) -> ! {
panic!("unsupported instruction data: {:?}", data)
}
}
fn get_argument(dfg: &DataFlowGraph, inst: Inst, i: usize) -> Option<Value> {
dfg.inst_args(inst).get(i).copied()
}
fn peepmatic_ty_to_ir_ty(ty: Type, dfg: &DataFlowGraph, root: Inst) -> types::Type {
match (ty.kind, bit_width(dfg, ty.bit_width, root)) {
(Kind::Int, 8) => types::I8,
(Kind::Int, 16) => types::I16,
(Kind::Int, 32) => types::I32,
(Kind::Int, 64) => types::I64,
(Kind::Int, 128) => types::I128,
(Kind::Bool, 1) => types::B1,
(Kind::Bool, 8) => types::I8,
(Kind::Bool, 16) => types::I16,
(Kind::Bool, 32) => types::I32,
(Kind::Bool, 64) => types::I64,
(Kind::Bool, 128) => types::I128,
_ => unreachable!(),
}
}
// NB: the unsafe contract we must uphold here is that our implementation of
// `instruction_result_bit_width` must always return a valid, non-zero bit
// width.
unsafe impl<'a, 'b> InstructionSet<'b> for &'a dyn TargetIsa {
type Context = FuncCursor<'b>;
type Instruction = ValueOrInst;
fn replace_instruction(
&self,
pos: &mut FuncCursor<'b>,
old: ValueOrInst,
new: Part<ValueOrInst>,
) -> ValueOrInst {
log::trace!("replace {:?} with {:?}", old, new);
let old_inst = old.resolve_inst(&pos.func.dfg).unwrap();
// Try to convert `new` to an instruction, because we prefer replacing
// an old instruction with a new one wholesale. However, if the
// replacement cannot be converted to an instruction (e.g. the
// right-hand side is a block/function parameter value) then we change
// the old instruction's result to an alias of the new value.
let new_inst = match new {
Part::Instruction(ValueOrInst::Inst(inst)) => Some(inst),
Part::Instruction(ValueOrInst::Value(_)) => {
// Do not try to follow the value definition. If we transplant
// this value's instruction, and there are other uses of this
// value, then we could mess up ordering between instructions.
None
}
Part::Constant(c) => {
let v = const_to_value(pos.ins(), c, old_inst);
let inst = pos.func.dfg.value_def(v).unwrap_inst();
Some(inst)
}
Part::ConditionCode(_) => None,
};
match new_inst {
Some(new_inst) => {
pos.func.transplant_inst(old_inst, new_inst);
debug_assert_eq!(pos.current_inst(), Some(old_inst));
old_inst.into()
}
None => {
let new_value = part_to_value(pos, old_inst, new).unwrap();
let old_results = pos.func.dfg.detach_results(old_inst);
let old_results = old_results.as_slice(&pos.func.dfg.value_lists);
assert_eq!(old_results.len(), 1);
let old_value = old_results[0];
pos.func.dfg.change_to_alias(old_value, new_value);
pos.func.dfg.replace(old_inst).nop();
new_value.into()
}
}
}
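// For example (hypothetical value numbers): if an optimization rewrites
// `v3 = iadd_imm v2, 0` to just `v2`, there is no new instruction to
// transplant, so the `None` arm above makes `v3` an alias of `v2` and turns
// the old `iadd_imm` into a `nop`.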
fn get_part_at_path(
&self,
pos: &mut FuncCursor<'b>,
root: ValueOrInst,
path: Path,
) -> Option<Part<ValueOrInst>> {
// The root is path [0].
debug_assert!(!path.0.is_empty());
debug_assert_eq!(path.0[0], 0);
let mut part = Part::Instruction(root);
for p in path.0[1..].iter().copied() {
let inst = part.as_instruction()?.resolve_inst(&pos.func.dfg)?;
let operator = pos.func.dfg[inst].opcode().to_peepmatic_operator()?;
if p < operator.immediates_arity() {
part = get_immediate(&pos.func.dfg, inst, p as usize);
continue;
}
let arg = p - operator.immediates_arity();
let arg = arg as usize;
let value = get_argument(&pos.func.dfg, inst, arg)?;
part = Part::Instruction(value.into());
}
log::trace!("get_part_at_path({:?}) = {:?}", path, part);
Some(part)
}
fn operator(&self, pos: &mut FuncCursor<'b>, value_or_inst: ValueOrInst) -> Option<Operator> {
let inst = value_or_inst.resolve_inst(&pos.func.dfg)?;
pos.func.dfg[inst].opcode().to_peepmatic_operator()
}
fn make_inst_1(
&self,
pos: &mut FuncCursor<'b>,
root: ValueOrInst,
operator: Operator,
r#type: Type,
a: Part<ValueOrInst>,
) -> ValueOrInst {
log::trace!("make_inst_1: {:?}({:?})", operator, a);
let root = root.resolve_inst(&pos.func.dfg).unwrap();
match operator {
Operator::AdjustSpDown => {
let a = part_to_value(pos, root, a).unwrap();
pos.ins().adjust_sp_down(a).into()
}
Operator::AdjustSpDownImm => {
let c = a.unwrap_constant();
let imm = Imm64::try_from(c).unwrap();
pos.ins().adjust_sp_down_imm(imm).into()
}
Operator::Bconst => {
let c = a.unwrap_constant();
let val = const_to_value(pos.ins(), c, root);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Bint => {
let a = part_to_value(pos, root, a).unwrap();
let ty = peepmatic_ty_to_ir_ty(r#type, &pos.func.dfg, root);
let val = pos.ins().bint(ty, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Brnz => {
let a = part_to_value(pos, root, a).unwrap();
// NB: branching instructions must be the root of an
// optimization's right-hand side, so we get the destination
// block and arguments from the left-hand side's root. Peepmatic
// doesn't currently represent labels or varargs.
let block = pos.func.dfg[root].branch_destination().unwrap();
let args = pos.func.dfg.inst_args(root)[1..].to_vec();
pos.ins().brnz(a, block, &args).into()
}
Operator::Brz => {
let a = part_to_value(pos, root, a).unwrap();
// See the comment in the `Operator::Brnz` match arm.
let block = pos.func.dfg[root].branch_destination().unwrap();
let args = pos.func.dfg.inst_args(root)[1..].to_vec();
pos.ins().brz(a, block, &args).into()
}
Operator::Iconst => {
let a = a.unwrap_constant();
let val = const_to_value(pos.ins(), a, root);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Ireduce => {
let a = part_to_value(pos, root, a).unwrap();
let ty = peepmatic_ty_to_ir_ty(r#type, &pos.func.dfg, root);
let val = pos.ins().ireduce(ty, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Sextend => {
let a = part_to_value(pos, root, a).unwrap();
let ty = peepmatic_ty_to_ir_ty(r#type, &pos.func.dfg, root);
let val = pos.ins().sextend(ty, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Trapnz => {
let a = part_to_value(pos, root, a).unwrap();
// NB: similar to branching instructions (see comment in the
// `Operator::Brnz` match arm) trapping instructions must be the
// root of an optimization's right-hand side, and we get the
// trap code from the root of the left-hand side. Peepmatic
// doesn't currently represent trap codes.
let code = pos.func.dfg[root].trap_code().unwrap();
pos.ins().trapnz(a, code).into()
}
Operator::Trapz => {
let a = part_to_value(pos, root, a).unwrap();
// See comment in the `Operator::Trapnz` match arm.
let code = pos.func.dfg[root].trap_code().unwrap();
pos.ins().trapz(a, code).into()
}
Operator::Uextend => {
let a = part_to_value(pos, root, a).unwrap();
let ty = peepmatic_ty_to_ir_ty(r#type, &pos.func.dfg, root);
let val = pos.ins().uextend(ty, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
_ => unreachable!(),
}
}
fn make_inst_2(
&self,
pos: &mut FuncCursor<'b>,
root: ValueOrInst,
operator: Operator,
_: Type,
a: Part<ValueOrInst>,
b: Part<ValueOrInst>,
) -> ValueOrInst {
log::trace!("make_inst_2: {:?}({:?}, {:?})", operator, a, b);
let root = root.resolve_inst(&pos.func.dfg).unwrap();
match operator {
Operator::Band => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().band(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::BandImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().band_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Bor => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().bor(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::BorImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().bor_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Bxor => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().bxor(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::BxorImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().bxor_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Iadd => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().iadd(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::IaddImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().iadd_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Ifcmp => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().ifcmp(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::IfcmpImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().ifcmp_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Imul => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().imul(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::ImulImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().imul_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::IrsubImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().irsub_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Ishl => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().ishl(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::IshlImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().ishl_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Isub => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().isub(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Rotl => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().rotl(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::RotlImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().rotl_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Rotr => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().rotr(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::RotrImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().rotr_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Sdiv => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().sdiv(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::SdivImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().sdiv_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Srem => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().srem(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::SremImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().srem_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Sshr => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().sshr(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::SshrImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().sshr_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Udiv => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().udiv(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::UdivImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().udiv_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Urem => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().urem(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::UremImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().urem_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Ushr => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().ushr(a, b);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::UshrImm => {
let a = part_to_imm64(pos, a);
let b = part_to_value(pos, root, b).unwrap();
let val = pos.ins().ushr_imm(b, a);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
_ => unreachable!(),
}
}
fn make_inst_3(
&self,
pos: &mut FuncCursor<'b>,
root: ValueOrInst,
operator: Operator,
_: Type,
a: Part<ValueOrInst>,
b: Part<ValueOrInst>,
c: Part<ValueOrInst>,
) -> ValueOrInst {
log::trace!("make_inst_3: {:?}({:?}, {:?}, {:?})", operator, a, b, c);
let root = root.resolve_inst(&pos.func.dfg).unwrap();
match operator {
Operator::Icmp => {
let cond = a.unwrap_condition_code();
let cond = peepmatic_to_intcc(cond);
let b = part_to_value(pos, root, b).unwrap();
let c = part_to_value(pos, root, c).unwrap();
let val = pos.ins().icmp(cond, b, c);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::IcmpImm => {
let cond = a.unwrap_condition_code();
let cond = peepmatic_to_intcc(cond);
let imm = part_to_imm64(pos, b);
let c = part_to_value(pos, root, c).unwrap();
let val = pos.ins().icmp_imm(cond, c, imm);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
Operator::Select => {
let a = part_to_value(pos, root, a).unwrap();
let b = part_to_value(pos, root, b).unwrap();
let c = part_to_value(pos, root, c).unwrap();
let val = pos.ins().select(a, b, c);
pos.func.dfg.value_def(val).unwrap_inst().into()
}
_ => unreachable!(),
}
}
fn instruction_to_constant(
&self,
pos: &mut FuncCursor<'b>,
value_or_inst: ValueOrInst,
) -> Option<Constant> {
value_or_inst.to_constant(pos)
}
fn instruction_result_bit_width(
&self,
pos: &mut FuncCursor<'b>,
value_or_inst: ValueOrInst,
) -> u8 {
value_or_inst.result_bit_width(&pos.func.dfg)
}
fn native_word_size_in_bits(&self, _pos: &mut FuncCursor<'b>) -> u8 {
self.pointer_bits()
}
}


@ -271,6 +271,42 @@ fn optimize_complex_addresses(pos: &mut EncCursor, inst: Inst, isa: &dyn TargetI
.replace(inst)
.sload32_complex(info.flags, &args, info.offset);
}
Opcode::Uload8x8 => {
pos.func
.dfg
.replace(inst)
.uload8x8_complex(info.flags, &args, info.offset);
}
Opcode::Sload8x8 => {
pos.func
.dfg
.replace(inst)
.sload8x8_complex(info.flags, &args, info.offset);
}
Opcode::Uload16x4 => {
pos.func
.dfg
.replace(inst)
.uload16x4_complex(info.flags, &args, info.offset);
}
Opcode::Sload16x4 => {
pos.func
.dfg
.replace(inst)
.sload16x4_complex(info.flags, &args, info.offset);
}
Opcode::Uload32x2 => {
pos.func
.dfg
.replace(inst)
.uload32x2_complex(info.flags, &args, info.offset);
}
Opcode::Sload32x2 => {
pos.func
.dfg
.replace(inst)
.sload32x2_complex(info.flags, &args, info.offset);
}
Opcode::Store => {
pos.func.dfg.replace(inst).store_complex(
info.flags,
@ -305,7 +341,7 @@ fn optimize_complex_addresses(pos: &mut EncCursor, inst: Inst, isa: &dyn TargetI
}
_ => panic!("Unsupported load or store opcode"),
},
InstructionData::BinaryImm {
InstructionData::BinaryImm64 {
opcode: Opcode::IaddImm,
arg,
imm,

193
third_party/rust/cranelift-codegen/src/preopt.peepmatic vendored Normal file

@ -0,0 +1,193 @@
;; Apply basic simplifications.
;;
;; This folds constants with arithmetic to form `_imm` instructions, and other
;; minor simplifications.
;;
;; Doesn't apply some simplifications if the native word width (in bytes) is
;; smaller than the width of the instruction's controlling type. This would
;; result in an illegal instruction that would likely be expanded back into an
;; instruction on smaller types with the same initial opcode, creating
;; unnecessary churn.
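;; As an example of how to read these rules: the first rewrite below matches
;; an `iadd` whose second operand is a constant `$C`, checks the
;; `fits-in-native-word` precondition on `$C`, and replaces the whole
;; instruction with `iadd_imm $C $x`.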
;; Binary instructions whose second argument is constant.
(=> (when (iadd $x $C)
(fits-in-native-word $C))
(iadd_imm $C $x))
(=> (when (imul $x $C)
(fits-in-native-word $C))
(imul_imm $C $x))
(=> (when (sdiv $x $C)
(fits-in-native-word $C))
(sdiv_imm $C $x))
(=> (when (udiv $x $C)
(fits-in-native-word $C))
(udiv_imm $C $x))
(=> (when (srem $x $C)
(fits-in-native-word $C))
(srem_imm $C $x))
(=> (when (urem $x $C)
(fits-in-native-word $C))
(urem_imm $C $x))
(=> (when (band $x $C)
(fits-in-native-word $C))
(band_imm $C $x))
(=> (when (bor $x $C)
(fits-in-native-word $C))
(bor_imm $C $x))
(=> (when (bxor $x $C)
(fits-in-native-word $C))
(bxor_imm $C $x))
(=> (when (rotl $x $C)
(fits-in-native-word $C))
(rotl_imm $C $x))
(=> (when (rotr $x $C)
(fits-in-native-word $C))
(rotr_imm $C $x))
(=> (when (ishl $x $C)
(fits-in-native-word $C))
(ishl_imm $C $x))
(=> (when (ushr $x $C)
(fits-in-native-word $C))
(ushr_imm $C $x))
(=> (when (sshr $x $C)
(fits-in-native-word $C))
(sshr_imm $C $x))
(=> (when (isub $x $C)
(fits-in-native-word $C))
(iadd_imm $(neg $C) $x))
(=> (when (ifcmp $x $C)
(fits-in-native-word $C))
(ifcmp_imm $C $x))
(=> (when (icmp $cond $x $C)
(fits-in-native-word $C))
(icmp_imm $cond $C $x))
;; Binary instructions whose first operand is constant.
(=> (when (iadd $C $x)
(fits-in-native-word $C))
(iadd_imm $C $x))
(=> (when (imul $C $x)
(fits-in-native-word $C))
(imul_imm $C $x))
(=> (when (band $C $x)
(fits-in-native-word $C))
(band_imm $C $x))
(=> (when (bor $C $x)
(fits-in-native-word $C))
(bor_imm $C $x))
(=> (when (bxor $C $x)
(fits-in-native-word $C))
(bxor_imm $C $x))
(=> (when (isub $C $x)
(fits-in-native-word $C))
(irsub_imm $C $x))
;; Unary instructions whose operand is constant.
(=> (adjust_sp_down $C) (adjust_sp_down_imm $C))
;; Fold `(binop_imm $C1 (binop_imm $C2 $x))` into `(binop_imm $(binop $C2 $C1) $x)`.
(=> (iadd_imm $C1 (iadd_imm $C2 $x)) (iadd_imm $(iadd $C1 $C2) $x))
(=> (imul_imm $C1 (imul_imm $C2 $x)) (imul_imm $(imul $C1 $C2) $x))
(=> (bor_imm $C1 (bor_imm $C2 $x)) (bor_imm $(bor $C1 $C2) $x))
(=> (band_imm $C1 (band_imm $C2 $x)) (band_imm $(band $C1 $C2) $x))
(=> (bxor_imm $C1 (bxor_imm $C2 $x)) (bxor_imm $(bxor $C1 $C2) $x))
;; Remove operations that are no-ops.
(=> (iadd_imm 0 $x) $x)
(=> (imul_imm 1 $x) $x)
(=> (sdiv_imm 1 $x) $x)
(=> (udiv_imm 1 $x) $x)
(=> (bor_imm 0 $x) $x)
(=> (band_imm -1 $x) $x)
(=> (bxor_imm 0 $x) $x)
(=> (rotl_imm 0 $x) $x)
(=> (rotr_imm 0 $x) $x)
(=> (ishl_imm 0 $x) $x)
(=> (ushr_imm 0 $x) $x)
(=> (sshr_imm 0 $x) $x)
;; Replace with zero.
(=> (imul_imm 0 $x) 0)
(=> (band_imm 0 $x) 0)
;; Replace with negative 1.
(=> (bor_imm -1 $x) -1)
;; Transform `[(x << N) >> N]` into a (un)signed-extending move.
;;
;; i16 -> i8 -> i16
(=> (when (ushr_imm 8 (ishl_imm 8 $x))
(bit-width $x 16))
(uextend{i16} (ireduce{i8} $x)))
(=> (when (sshr_imm 8 (ishl_imm 8 $x))
(bit-width $x 16))
(sextend{i16} (ireduce{i8} $x)))
;; i32 -> i8 -> i32
(=> (when (ushr_imm 24 (ishl_imm 24 $x))
(bit-width $x 32))
(uextend{i32} (ireduce{i8} $x)))
(=> (when (sshr_imm 24 (ishl_imm 24 $x))
(bit-width $x 32))
(sextend{i32} (ireduce{i8} $x)))
;; i32 -> i16 -> i32
(=> (when (ushr_imm 16 (ishl_imm 16 $x))
(bit-width $x 32))
(uextend{i32} (ireduce{i16} $x)))
(=> (when (sshr_imm 16 (ishl_imm 16 $x))
(bit-width $x 32))
(sextend{i32} (ireduce{i16} $x)))
;; i64 -> i8 -> i64
(=> (when (ushr_imm 56 (ishl_imm 56 $x))
(bit-width $x 64))
(uextend{i64} (ireduce{i8} $x)))
(=> (when (sshr_imm 56 (ishl_imm 56 $x))
(bit-width $x 64))
(sextend{i64} (ireduce{i8} $x)))
;; i64 -> i16 -> i64
(=> (when (ushr_imm 48 (ishl_imm 48 $x))
(bit-width $x 64))
(uextend{i64} (ireduce{i16} $x)))
(=> (when (sshr_imm 48 (ishl_imm 48 $x))
(bit-width $x 64))
(sextend{i64} (ireduce{i16} $x)))
;; i64 -> i32 -> i64
(=> (when (ushr_imm 32 (ishl_imm 32 $x))
(bit-width $x 64))
(uextend{i64} (ireduce{i32} $x)))
(=> (when (sshr_imm 32 (ishl_imm 32 $x))
(bit-width $x 64))
(sextend{i64} (ireduce{i32} $x)))
;; Fold away redundant `bint` instructions that accept both integer and boolean
;; arguments.
(=> (select (bint $x) $y $z) (select $x $y $z))
(=> (brz (bint $x)) (brz $x))
(=> (brnz (bint $x)) (brnz $x))
(=> (trapz (bint $x)) (trapz $x))
(=> (trapnz (bint $x)) (trapnz $x))
;; Fold comparisons into branch operations when possible.
;;
;; This matches against operations which compare against zero, then use the
;; result in a `brz` or `brnz` branch. It folds those two operations into a
;; single `brz` or `brnz`.
(=> (brnz (icmp_imm ne 0 $x)) (brnz $x))
(=> (brz (icmp_imm ne 0 $x)) (brz $x))
(=> (brnz (icmp_imm eq 0 $x)) (brz $x))
(=> (brz (icmp_imm eq 0 $x)) (brnz $x))
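;; For instance, the last rule above applies because `icmp_imm eq 0 $x` is
;; zero exactly when `$x` is non-zero, so branching on that result being zero
;; is the same as `brnz $x`.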
;; Division and remainder by constants.
;;
;; TODO: this section is incomplete, and a bunch of related optimizations are
;; still hand-coded in `simple_preopt.rs`.
;; (Division by one is handled above.)
;; Remainder by one is zero.
(=> (urem_imm 1 $x) 0)
(=> (srem_imm 1 $x) 0)
;; Division by a power of two -> shift right.
(=> (when (udiv_imm $C $x)
(is-power-of-two $C))
(ushr_imm $(log2 $C) $x))
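;; For instance, a hypothetical `(udiv_imm 8 $x)` is rewritten by the rule
;; above to `(ushr_imm 3 $x)`, since 8 is a power of two and log2(8) = 3.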

Binary data
third_party/rust/cranelift-codegen/src/preopt.serialized vendored Normal file

Binary file not shown.

393
third_party/rust/cranelift-codegen/src/remove_constant_phis.rs vendored Normal file

@ -0,0 +1,393 @@
//! A Constant-Phi-Node removal pass.
use log::info;
use crate::dominator_tree::DominatorTree;
use crate::entity::EntityList;
use crate::fx::FxHashMap;
use crate::fx::FxHashSet;
use crate::ir::instructions::BranchInfo;
use crate::ir::Function;
use crate::ir::{Block, Inst, Value};
use crate::timing;
use smallvec::{smallvec, SmallVec};
use std::vec::Vec;
// A note on notation. For the sake of clarity, this file uses the phrase
// "formal parameters" to mean the `Value`s listed in the block head, and
// "actual parameters" to mean the `Value`s passed in a branch or a jump:
//
// block4(v16: i32, v18: i32): <-- formal parameters
// ...
// brnz v27, block7(v22, v24) <-- actual parameters
// jump block6
// This transformation pass (conceptually) partitions all values in the
// function into two groups:
//
// * Group A: values defined by block formal parameters, except for the entry block.
//
// * Group B: All other values: that is, values defined by instructions,
// and the formals of the entry block.
//
// For each value in Group A, it attempts to establish whether it will have
// the value of exactly one member of Group B. If so, the formal parameter is
// deleted, all corresponding actual parameters (in jumps/branches to the
// defining block) are deleted, and a rename is inserted.
//
// The entry block is special-cased because (1) we don't know what values flow
// to its formals and (2) in any case we can't change its formals.
//
// Work proceeds in three phases.
//
// * Phase 1: examine all instructions. For each block, make up a useful
// grab-bag of information, `BlockSummary`, that summarises the block's
// formals and jump/branch instruction. This is used by Phases 2 and 3.
//
// * Phase 2: for each value in Group A, try to find a single Group B value
// that flows to it. This is done using a classical iterative forward
// dataflow analysis over a simple constant-propagation style lattice. It
// converges quickly in practice -- I have seen at most 4 iterations. This
// is relatively cheap because the iteration is done over the
// `BlockSummary`s, and does not visit each instruction. The resulting
// fixed point is stored in a `SolverState`.
//
// * Phase 3: using the `SolverState` and `BlockSummary`, edit the function to
// remove redundant formals and actuals, and to insert suitable renames.
//
// Note that the effectiveness of the analysis depends on the fact that
// there are no copy instructions in Cranelift's IR. If there were, the
// computation of `actual_absval` in Phase 2 would have to be extended to
// chase through such copies.
//
// For large functions, the analysis cost using the new AArch64 backend is about
// 0.6% of the non-optimising compile time, as measured by instruction counts.
// This transformation usually pays for itself several times over, though, by
// reducing the isel/regalloc cost downstream. Gains of up to 7% have been
// seen for large functions.
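// As a small worked illustration (the value numbers here are hypothetical):
// if every branch to block4 above passes the same value v10 for the formal
// v16, Phase 2 converges on One(v10) for v16; Phase 3 then deletes the formal
// v16, drops the corresponding actual from each branch, and makes v16 an
// alias of v10. If two different values can reach v18, its abstract value is
// Many and it is left untouched.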
// The `Value`s (Group B) that can flow to a formal parameter (Group A).
#[derive(Clone, Copy, Debug, PartialEq)]
enum AbstractValue {
// Two or more values flow to this formal.
Many,
// Exactly one value, as stated, flows to this formal. The `Value`s that
// can appear here are exactly: `Value`s defined by `Inst`s, plus the
// `Value`s defined by the formals of the entry block. Note that this is
// exactly the set of `Value`s that are *not* tracked in the solver below
// (see `SolverState`).
One(Value /*Group B*/),
// No value flows to this formal.
None,
}
impl AbstractValue {
fn join(self, other: AbstractValue) -> AbstractValue {
match (self, other) {
// Joining with `None` has no effect
(AbstractValue::None, p2) => p2,
(p1, AbstractValue::None) => p1,
// Joining with `Many` produces `Many`
(AbstractValue::Many, _p2) => AbstractValue::Many,
(_p1, AbstractValue::Many) => AbstractValue::Many,
// The only interesting case
(AbstractValue::One(v1), AbstractValue::One(v2)) => {
if v1 == v2 {
AbstractValue::One(v1)
} else {
AbstractValue::Many
}
}
}
}
fn is_one(self) -> bool {
if let AbstractValue::One(_) = self {
true
} else {
false
}
}
}
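// Written out, `join` is the usual flat constant-propagation lattice:
//
//   join(None,   x)      = x
//   join(Many,   _)      = Many
//   join(One(a), One(a)) = One(a)
//   join(One(a), One(b)) = Many    when a != b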
// For some block, a useful bundle of info. The `Block` itself is not stored
// here since it will be the key in the associated `FxHashMap` -- see
// `summaries` below. For the `SmallVec` tuning params: most blocks have
// few parameters, hence `4`. And almost all blocks have either one or two
// successors, hence `2`.
#[derive(Debug)]
struct BlockSummary {
// Formal parameters for this `Block`
formals: SmallVec<[Value; 4] /*Group A*/>,
// For each `Inst` in this block that transfers to another block: the
// `Inst` itself, the destination `Block`, and the actual parameters
// passed. We don't bother to include transfers that pass zero parameters
// since that makes more work for the solver for no purpose.
dests: SmallVec<[(Inst, Block, SmallVec<[Value; 4] /*both Groups A and B*/>); 2]>,
}
impl BlockSummary {
fn new(formals: SmallVec<[Value; 4]>) -> Self {
Self {
formals,
dests: smallvec![],
}
}
}
// Solver state. This holds an AbstractValue for each formal parameter, except
// for those from the entry block.
struct SolverState {
absvals: FxHashMap<Value /*Group A*/, AbstractValue>,
}
impl SolverState {
fn new() -> Self {
Self {
absvals: FxHashMap::default(),
}
}
fn get(&self, actual: Value) -> AbstractValue {
match self.absvals.get(&actual) {
Some(lp) => *lp,
None => panic!("SolverState::get: formal param {:?} is untracked?!", actual),
}
}
fn maybe_get(&self, actual: Value) -> Option<&AbstractValue> {
self.absvals.get(&actual)
}
fn set(&mut self, actual: Value, lp: AbstractValue) {
match self.absvals.insert(actual, lp) {
Some(_old_lp) => {}
None => panic!("SolverState::set: formal param {:?} is untracked?!", actual),
}
}
}
/// Detect phis in `func` that will only ever produce one value, using a
/// classic forward dataflow analysis. Then remove them.
#[inline(never)]
pub fn do_remove_constant_phis(func: &mut Function, domtree: &mut DominatorTree) {
let _tt = timing::remove_constant_phis();
debug_assert!(domtree.is_valid());
// Get the blocks, in reverse postorder
let mut blocks_reverse_postorder = Vec::<Block>::new();
for block in domtree.cfg_postorder() {
blocks_reverse_postorder.push(*block);
}
blocks_reverse_postorder.reverse();
// Phase 1 of 3: for each block, make a summary containing all relevant
// info. The solver will iterate over the summaries, rather than having
// to inspect each instruction in each block.
let mut summaries = FxHashMap::<Block, BlockSummary>::default();
for b in &blocks_reverse_postorder {
let formals = func.dfg.block_params(*b);
let mut summary = BlockSummary::new(SmallVec::from(formals));
for inst in func.layout.block_insts(*b) {
let idetails = &func.dfg[inst];
// Note that multi-dest transfers (i.e., branch tables) don't
// carry parameters in our IR, so we only have to care about
// `SingleDest` here.
if let BranchInfo::SingleDest(dest, _) = idetails.analyze_branch(&func.dfg.value_lists)
{
let inst_var_args = func.dfg.inst_variable_args(inst);
// Skip branches/jumps that carry no params.
if inst_var_args.len() > 0 {
let mut actuals = SmallVec::<[Value; 4]>::new();
for arg in inst_var_args {
let arg = func.dfg.resolve_aliases(*arg);
actuals.push(arg);
}
summary.dests.push((inst, dest, actuals));
}
}
}
// Ensure the invariant that all blocks (except for the entry) appear
// in the summary, *unless* they have neither formals nor any
// param-carrying branches/jumps.
if formals.len() > 0 || summary.dests.len() > 0 {
summaries.insert(*b, summary);
}
}
// Phase 2 of 3: iterate over the summaries in reverse postorder,
// computing new `AbstractValue`s for each tracked `Value`. The set of
// tracked `Value`s is exactly Group A as described above.
let entry_block = func
.layout
.entry_block()
.expect("remove_constant_phis: entry block unknown");
// Set up initial solver state
let mut state = SolverState::new();
for b in &blocks_reverse_postorder {
// For each block, get the formals
if *b == entry_block {
continue;
}
let formals: &[Value] = func.dfg.block_params(*b);
for formal in formals {
let mb_old_absval = state.absvals.insert(*formal, AbstractValue::None);
assert!(mb_old_absval.is_none());
}
}
// Solve: repeatedly traverse the blocks in reverse postorder, until there
// are no changes.
let mut iter_no = 0;
loop {
iter_no += 1;
let mut changed = false;
for src in &blocks_reverse_postorder {
let mb_src_summary = summaries.get(src);
// The src block might have no summary. This means it has no
// branches/jumps that carry parameters *and* it doesn't take any
// parameters itself. Phase 1 ensures this. So we can ignore it.
if mb_src_summary.is_none() {
continue;
}
let src_summary = mb_src_summary.unwrap();
for (_inst, dst, src_actuals) in &src_summary.dests {
assert!(*dst != entry_block);
// By contrast, the dst block must have a summary. Phase 1
// will have only included an entry in `src_summary.dests` if
// that branch/jump carried at least one parameter. So the
// dst block does take parameters, so it must have a summary.
let dst_summary = summaries
.get(dst)
.expect("remove_constant_phis: dst block has no summary");
let dst_formals = &dst_summary.formals;
assert!(src_actuals.len() == dst_formals.len());
for (formal, actual) in dst_formals.iter().zip(src_actuals.iter()) {
// Find the abstract value for `actual`. If it is a block
// formal parameter then the most recent abstract value is
// to be found in the solver state. If not, then it's a
// real value defining point (not a phi), in which case
// return it itself.
let actual_absval = match state.maybe_get(*actual) {
Some(pt) => *pt,
None => AbstractValue::One(*actual),
};
// And `join` the new value with the old.
let formal_absval_old = state.get(*formal);
let formal_absval_new = formal_absval_old.join(actual_absval);
if formal_absval_new != formal_absval_old {
changed = true;
state.set(*formal, formal_absval_new);
}
}
}
}
if !changed {
break;
}
}
let mut n_consts = 0;
for absval in state.absvals.values() {
if absval.is_one() {
n_consts += 1;
}
}
// Phase 3 of 3: edit the function to remove constant formals, using the
// summaries and the final solver state as a guide.
// Make up a set of blocks that need editing.
let mut need_editing = FxHashSet::<Block>::default();
for (block, summary) in &summaries {
if *block == entry_block {
continue;
}
for formal in &summary.formals {
let formal_absval = state.get(*formal);
if formal_absval.is_one() {
need_editing.insert(*block);
break;
}
}
}
// Firstly, deal with the formals. For each formal which is redundant,
// remove it, and also add a reroute from it to the constant value which
// we know it to be.
for b in &need_editing {
let mut del_these = SmallVec::<[(Value, Value); 32]>::new();
let formals: &[Value] = func.dfg.block_params(*b);
for formal in formals {
// The state must give an absval for `formal`.
if let AbstractValue::One(replacement_val) = state.get(*formal) {
del_these.push((*formal, replacement_val));
}
}
// We can delete the formals in any order. However,
// `remove_block_param` works by sliding backwards all arguments to
// the right of the one it is asked to delete. Hence when removing more
// than one formal, it is significantly more efficient to ask it to
// remove the rightmost formal first, and hence this `reverse`.
del_these.reverse();
for (redundant_formal, replacement_val) in del_these {
func.dfg.remove_block_param(redundant_formal);
func.dfg.change_to_alias(redundant_formal, replacement_val);
}
}
// Secondly, visit all branch insns. If the destination has had its
// formals changed, change the actuals accordingly. Don't scan all insns,
// rather just visit those as listed in the summaries we prepared earlier.
for (_src_block, summary) in &summaries {
for (inst, dst_block, _src_actuals) in &summary.dests {
if !need_editing.contains(dst_block) {
continue;
}
let old_actuals = func.dfg[*inst].take_value_list().unwrap();
let num_old_actuals = old_actuals.len(&func.dfg.value_lists);
let num_fixed_actuals = func.dfg[*inst]
.opcode()
.constraints()
.num_fixed_value_arguments();
let dst_summary = summaries.get(&dst_block).unwrap();
// Check that the numbers of arguments make sense.
assert!(num_fixed_actuals <= num_old_actuals);
assert!(num_fixed_actuals + dst_summary.formals.len() == num_old_actuals);
// Create a new value list.
let mut new_actuals = EntityList::<Value>::new();
// Copy the fixed args to the new list
for i in 0..num_fixed_actuals {
let val = old_actuals.get(i, &func.dfg.value_lists).unwrap();
new_actuals.push(val, &mut func.dfg.value_lists);
}
// Copy the variable args (the actual block params) to the new
// list, filtering out redundant ones.
for i in 0..dst_summary.formals.len() {
let actual_i = old_actuals
.get(num_fixed_actuals + i, &func.dfg.value_lists)
.unwrap();
let formal_i = dst_summary.formals[i];
let is_redundant = state.get(formal_i).is_one();
if !is_redundant {
new_actuals.push(actual_i, &mut func.dfg.value_lists);
}
}
func.dfg[*inst].put_value_list(new_actuals);
}
}
info!(
"do_remove_constant_phis: done, {} iters. {} formals, of which {} const.",
iter_no,
state.absvals.len(),
n_consts
);
}


@ -10,10 +10,8 @@ use crate::divconst_magic_numbers::{MS32, MS64, MU32, MU64};
use crate::flowgraph::ControlFlowGraph;
use crate::ir::{
condcodes::{CondCode, IntCC},
dfg::ValueDef,
immediates,
instructions::{Opcode, ValueList},
types::{I16, I32, I64, I8},
instructions::Opcode,
types::{I32, I64},
Block, DataFlowGraph, Function, Inst, InstBuilder, InstructionData, Type, Value,
};
use crate::isa::TargetIsa;
@ -144,7 +142,7 @@ fn package_up_divrem_info(
/// Examine `inst` to see if it is a div or rem by a constant, and if so return the operands,
/// signedness, operation size and div-vs-rem-ness in a handy bundle.
fn get_div_info(inst: Inst, dfg: &DataFlowGraph) -> Option<DivRemByConstInfo> {
if let InstructionData::BinaryImm { opcode, arg, imm } = dfg[inst] {
if let InstructionData::BinaryImm64 { opcode, arg, imm } = dfg[inst] {
let (is_signed, is_rem) = match opcode {
Opcode::UdivImm => (false, false),
Opcode::UremImm => (false, true),
@ -468,340 +466,6 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
}
}
#[inline]
fn resolve_imm64_value(dfg: &DataFlowGraph, value: Value) -> Option<immediates::Imm64> {
if let ValueDef::Result(candidate_inst, _) = dfg.value_def(value) {
if let InstructionData::UnaryImm {
opcode: Opcode::Iconst,
imm,
} = dfg[candidate_inst]
{
return Some(imm);
}
}
None
}
/// Try to transform [(x << N) >> N] into a (un)signed-extending move.
/// Returns true if the final instruction has been converted to such a move.
fn try_fold_extended_move(
pos: &mut FuncCursor,
inst: Inst,
opcode: Opcode,
arg: Value,
imm: immediates::Imm64,
) -> bool {
if let ValueDef::Result(arg_inst, _) = pos.func.dfg.value_def(arg) {
if let InstructionData::BinaryImm {
opcode: Opcode::IshlImm,
arg: prev_arg,
imm: prev_imm,
} = &pos.func.dfg[arg_inst]
{
if imm != *prev_imm {
return false;
}
let dest_ty = pos.func.dfg.ctrl_typevar(inst);
if dest_ty != pos.func.dfg.ctrl_typevar(arg_inst) || !dest_ty.is_int() {
return false;
}
let imm_bits: i64 = imm.into();
let ireduce_ty = match (dest_ty.lane_bits() as i64).wrapping_sub(imm_bits) {
8 => I8,
16 => I16,
32 => I32,
_ => return false,
};
let ireduce_ty = ireduce_ty.by(dest_ty.lane_count()).unwrap();
// This becomes a no-op, since ireduce_ty has a smaller lane width than
// the argument type (also the destination type).
let arg = *prev_arg;
let narrower_arg = pos.ins().ireduce(ireduce_ty, arg);
if opcode == Opcode::UshrImm {
pos.func.dfg.replace(inst).uextend(dest_ty, narrower_arg);
} else {
pos.func.dfg.replace(inst).sextend(dest_ty, narrower_arg);
}
return true;
}
}
false
}
/// Apply basic simplifications.
///
/// This folds constants with arithmetic to form `_imm` instructions, and other minor
/// simplifications.
///
/// Doesn't apply some simplifications if the native word width (in bytes) is smaller than the
/// width of the instruction's controlling type. This would result in an illegal instruction that
/// would likely be expanded back into an instruction on smaller types with the same initial
/// opcode, creating unnecessary churn.
fn simplify(pos: &mut FuncCursor, inst: Inst, native_word_width: u32) {
match pos.func.dfg[inst] {
InstructionData::Binary { opcode, args } => {
if let Some(mut imm) = resolve_imm64_value(&pos.func.dfg, args[1]) {
let new_opcode = match opcode {
Opcode::Iadd => Opcode::IaddImm,
Opcode::Imul => Opcode::ImulImm,
Opcode::Sdiv => Opcode::SdivImm,
Opcode::Udiv => Opcode::UdivImm,
Opcode::Srem => Opcode::SremImm,
Opcode::Urem => Opcode::UremImm,
Opcode::Band => Opcode::BandImm,
Opcode::Bor => Opcode::BorImm,
Opcode::Bxor => Opcode::BxorImm,
Opcode::Rotl => Opcode::RotlImm,
Opcode::Rotr => Opcode::RotrImm,
Opcode::Ishl => Opcode::IshlImm,
Opcode::Ushr => Opcode::UshrImm,
Opcode::Sshr => Opcode::SshrImm,
Opcode::Isub => {
imm = imm.wrapping_neg();
Opcode::IaddImm
}
Opcode::Ifcmp => Opcode::IfcmpImm,
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
if ty.bytes() <= native_word_width {
pos.func
.dfg
.replace(inst)
.BinaryImm(new_opcode, ty, imm, args[0]);
// Repeat for BinaryImm simplification.
simplify(pos, inst, native_word_width);
}
} else if let Some(imm) = resolve_imm64_value(&pos.func.dfg, args[0]) {
let new_opcode = match opcode {
Opcode::Iadd => Opcode::IaddImm,
Opcode::Imul => Opcode::ImulImm,
Opcode::Band => Opcode::BandImm,
Opcode::Bor => Opcode::BorImm,
Opcode::Bxor => Opcode::BxorImm,
Opcode::Isub => Opcode::IrsubImm,
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
if ty.bytes() <= native_word_width {
pos.func
.dfg
.replace(inst)
.BinaryImm(new_opcode, ty, imm, args[1]);
}
}
}
InstructionData::Unary { opcode, arg } => {
if let Opcode::AdjustSpDown = opcode {
if let Some(imm) = resolve_imm64_value(&pos.func.dfg, arg) {
// Note this works for both positive and negative immediate values.
pos.func.dfg.replace(inst).adjust_sp_down_imm(imm);
}
}
}
InstructionData::BinaryImm { opcode, arg, imm } => {
let ty = pos.func.dfg.ctrl_typevar(inst);
let mut arg = arg;
let mut imm = imm;
match opcode {
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::BorImm
| Opcode::BandImm
| Opcode::BxorImm => {
// Fold binary_op(C2, binary_op(C1, x)) into binary_op(binary_op(C1, C2), x)
if let ValueDef::Result(arg_inst, _) = pos.func.dfg.value_def(arg) {
if let InstructionData::BinaryImm {
opcode: prev_opcode,
arg: prev_arg,
imm: prev_imm,
} = &pos.func.dfg[arg_inst]
{
if opcode == *prev_opcode && ty == pos.func.dfg.ctrl_typevar(arg_inst) {
let lhs: i64 = imm.into();
let rhs: i64 = (*prev_imm).into();
let new_imm = match opcode {
Opcode::BorImm => lhs | rhs,
Opcode::BandImm => lhs & rhs,
Opcode::BxorImm => lhs ^ rhs,
Opcode::IaddImm => lhs.wrapping_add(rhs),
Opcode::ImulImm => lhs.wrapping_mul(rhs),
_ => panic!("can't happen"),
};
let new_imm = immediates::Imm64::from(new_imm);
let new_arg = *prev_arg;
pos.func
.dfg
.replace(inst)
.BinaryImm(opcode, ty, new_imm, new_arg);
imm = new_imm;
arg = new_arg;
}
}
}
}
Opcode::UshrImm | Opcode::SshrImm => {
if pos.func.dfg.ctrl_typevar(inst).bytes() <= native_word_width
&& try_fold_extended_move(pos, inst, opcode, arg, imm)
{
return;
}
}
_ => {}
};
// Replace operations that are no-ops.
match (opcode, imm.into()) {
(Opcode::IaddImm, 0)
| (Opcode::ImulImm, 1)
| (Opcode::SdivImm, 1)
| (Opcode::UdivImm, 1)
| (Opcode::BorImm, 0)
| (Opcode::BandImm, -1)
| (Opcode::BxorImm, 0)
| (Opcode::RotlImm, 0)
| (Opcode::RotrImm, 0)
| (Opcode::IshlImm, 0)
| (Opcode::UshrImm, 0)
| (Opcode::SshrImm, 0) => {
// Alias the result value with the original argument.
replace_single_result_with_alias(&mut pos.func.dfg, inst, arg);
}
(Opcode::ImulImm, 0) | (Opcode::BandImm, 0) => {
// Replace by zero.
pos.func.dfg.replace(inst).iconst(ty, 0);
}
(Opcode::BorImm, -1) => {
// Replace by minus one.
pos.func.dfg.replace(inst).iconst(ty, -1);
}
_ => {}
}
}
InstructionData::IntCompare { opcode, cond, args } => {
debug_assert_eq!(opcode, Opcode::Icmp);
if let Some(imm) = resolve_imm64_value(&pos.func.dfg, args[1]) {
if pos.func.dfg.ctrl_typevar(inst).bytes() <= native_word_width {
pos.func.dfg.replace(inst).icmp_imm(cond, args[0], imm);
}
}
}
InstructionData::CondTrap { .. }
| InstructionData::Branch { .. }
| InstructionData::Ternary {
opcode: Opcode::Select,
..
} => {
// Fold away a redundant `bint`.
let condition_def = {
let args = pos.func.dfg.inst_args(inst);
pos.func.dfg.value_def(args[0])
};
if let ValueDef::Result(def_inst, _) = condition_def {
if let InstructionData::Unary {
opcode: Opcode::Bint,
arg: bool_val,
} = pos.func.dfg[def_inst]
{
let args = pos.func.dfg.inst_args_mut(inst);
args[0] = bool_val;
}
}
}
_ => {}
}
}
struct BranchOptInfo {
br_inst: Inst,
cmp_arg: Value,
args: ValueList,
new_opcode: Opcode,
}
/// Fold comparisons into branch operations when possible.
///
/// This matches against operations which compare against zero, then use the
/// result in a `brz` or `brnz` branch. It folds those two operations into a
/// single `brz` or `brnz`.
fn branch_opt(pos: &mut FuncCursor, inst: Inst) {
let mut info = if let InstructionData::Branch {
opcode: br_opcode,
args: ref br_args,
..
} = pos.func.dfg[inst]
{
let first_arg = {
let args = pos.func.dfg.inst_args(inst);
args[0]
};
let icmp_inst = if let ValueDef::Result(icmp_inst, _) = pos.func.dfg.value_def(first_arg) {
icmp_inst
} else {
return;
};
if let InstructionData::IntCompareImm {
opcode: Opcode::IcmpImm,
arg: cmp_arg,
cond: cmp_cond,
imm: cmp_imm,
} = pos.func.dfg[icmp_inst]
{
let cmp_imm: i64 = cmp_imm.into();
if cmp_imm != 0 {
return;
}
// icmp_imm returns non-zero when the comparison is true. So, if
// we're branching on zero, we need to invert the condition.
let cond = match br_opcode {
Opcode::Brz => cmp_cond.inverse(),
Opcode::Brnz => cmp_cond,
_ => return,
};
let new_opcode = match cond {
IntCC::Equal => Opcode::Brz,
IntCC::NotEqual => Opcode::Brnz,
_ => return,
};
BranchOptInfo {
br_inst: inst,
cmp_arg,
args: br_args.clone(),
new_opcode,
}
} else {
return;
}
} else {
return;
};
info.args.as_mut_slice(&mut pos.func.dfg.value_lists)[0] = info.cmp_arg;
if let InstructionData::Branch { ref mut opcode, .. } = pos.func.dfg[info.br_inst] {
*opcode = info.new_opcode;
} else {
panic!();
}
}
enum BranchOrderKind {
BrzToBrnz(Value),
BrnzToBrz(Value),
@ -944,15 +608,490 @@ fn branch_order(pos: &mut FuncCursor, cfg: &mut ControlFlowGraph, block: Block,
cfg.recompute_block(pos.func, block);
}
#[cfg(feature = "enable-peepmatic")]
mod simplify {
use super::*;
use crate::peepmatic::ValueOrInst;
pub type PeepholeOptimizer<'a, 'b> =
peepmatic_runtime::optimizer::PeepholeOptimizer<'static, 'a, &'b dyn TargetIsa>;
pub fn peephole_optimizer<'a, 'b>(isa: &'b dyn TargetIsa) -> PeepholeOptimizer<'a, 'b> {
crate::peepmatic::preopt(isa)
}
pub fn apply_all<'a, 'b>(
optimizer: &mut PeepholeOptimizer<'a, 'b>,
pos: &mut FuncCursor<'a>,
inst: Inst,
_native_word_width: u32,
) {
// After we apply one optimization, that might make another
// optimization applicable. Keep running the peephole optimizer
// until either:
//
// * No optimization applied, and therefore it doesn't make sense to
// try again, because no optimization will apply again.
//
// * Or when we replaced an instruction with an alias to an existing
// value, because we already ran the peephole optimizer over the
// aliased value's instruction in an early part of the traversal
// over the function.
while let Some(ValueOrInst::Inst(new_inst)) =
optimizer.apply_one(pos, ValueOrInst::Inst(inst))
{
// We transplanted a new instruction into the current
// instruction, so the "new" instruction is actually the same
// one, just with different data.
debug_assert_eq!(new_inst, inst);
}
debug_assert_eq!(pos.current_inst(), Some(inst));
}
}
#[cfg(not(feature = "enable-peepmatic"))]
mod simplify {
use super::*;
use crate::ir::{
dfg::ValueDef,
immediates,
instructions::{Opcode, ValueList},
types::{B8, I16, I32, I8},
};
use std::marker::PhantomData;
pub struct PeepholeOptimizer<'a, 'b> {
phantom: PhantomData<(&'a (), &'b ())>,
}
pub fn peephole_optimizer<'a, 'b>(_: &dyn TargetIsa) -> PeepholeOptimizer<'a, 'b> {
PeepholeOptimizer {
phantom: PhantomData,
}
}
pub fn apply_all<'a, 'b>(
_optimizer: &mut PeepholeOptimizer<'a, 'b>,
pos: &mut FuncCursor<'a>,
inst: Inst,
native_word_width: u32,
) {
simplify(pos, inst, native_word_width);
branch_opt(pos, inst);
}
#[inline]
fn resolve_imm64_value(dfg: &DataFlowGraph, value: Value) -> Option<immediates::Imm64> {
if let ValueDef::Result(candidate_inst, _) = dfg.value_def(value) {
if let InstructionData::UnaryImm {
opcode: Opcode::Iconst,
imm,
} = dfg[candidate_inst]
{
return Some(imm);
}
}
None
}
/// Try to transform [(x << N) >> N] into a (un)signed-extending move.
/// Returns true if the final instruction has been converted to such a move.
fn try_fold_extended_move(
pos: &mut FuncCursor,
inst: Inst,
opcode: Opcode,
arg: Value,
imm: immediates::Imm64,
) -> bool {
if let ValueDef::Result(arg_inst, _) = pos.func.dfg.value_def(arg) {
if let InstructionData::BinaryImm64 {
opcode: Opcode::IshlImm,
arg: prev_arg,
imm: prev_imm,
} = &pos.func.dfg[arg_inst]
{
if imm != *prev_imm {
return false;
}
let dest_ty = pos.func.dfg.ctrl_typevar(inst);
if dest_ty != pos.func.dfg.ctrl_typevar(arg_inst) || !dest_ty.is_int() {
return false;
}
let imm_bits: i64 = imm.into();
let ireduce_ty = match (dest_ty.lane_bits() as i64).wrapping_sub(imm_bits) {
8 => I8,
16 => I16,
32 => I32,
_ => return false,
};
let ireduce_ty = ireduce_ty.by(dest_ty.lane_count()).unwrap();
// This becomes a no-op, since ireduce_ty has a smaller lane width than
// the argument type (also the destination type).
let arg = *prev_arg;
let narrower_arg = pos.ins().ireduce(ireduce_ty, arg);
if opcode == Opcode::UshrImm {
pos.func.dfg.replace(inst).uextend(dest_ty, narrower_arg);
} else {
pos.func.dfg.replace(inst).sextend(dest_ty, narrower_arg);
}
return true;
}
}
false
}
/// Apply basic simplifications.
///
/// This folds constants with arithmetic to form `_imm` instructions, and other minor
/// simplifications.
///
/// Doesn't apply some simplifications if the native word width (in bytes) is smaller than the
/// width of the instruction's controlling type. This would result in an illegal instruction that
/// would likely be expanded back into an instruction on smaller types with the same initial
/// opcode, creating unnecessary churn.
fn simplify(pos: &mut FuncCursor, inst: Inst, native_word_width: u32) {
match pos.func.dfg[inst] {
InstructionData::Binary { opcode, args } => {
if let Some(mut imm) = resolve_imm64_value(&pos.func.dfg, args[1]) {
let new_opcode = match opcode {
Opcode::Iadd => Opcode::IaddImm,
Opcode::Imul => Opcode::ImulImm,
Opcode::Sdiv => Opcode::SdivImm,
Opcode::Udiv => Opcode::UdivImm,
Opcode::Srem => Opcode::SremImm,
Opcode::Urem => Opcode::UremImm,
Opcode::Band => Opcode::BandImm,
Opcode::Bor => Opcode::BorImm,
Opcode::Bxor => Opcode::BxorImm,
Opcode::Rotl => Opcode::RotlImm,
Opcode::Rotr => Opcode::RotrImm,
Opcode::Ishl => Opcode::IshlImm,
Opcode::Ushr => Opcode::UshrImm,
Opcode::Sshr => Opcode::SshrImm,
Opcode::Isub => {
imm = imm.wrapping_neg();
Opcode::IaddImm
}
Opcode::Ifcmp => Opcode::IfcmpImm,
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
if ty.bytes() <= native_word_width {
pos.func
.dfg
.replace(inst)
.BinaryImm64(new_opcode, ty, imm, args[0]);
// Repeat for BinaryImm64 simplification.
simplify(pos, inst, native_word_width);
}
} else if let Some(imm) = resolve_imm64_value(&pos.func.dfg, args[0]) {
let new_opcode = match opcode {
Opcode::Iadd => Opcode::IaddImm,
Opcode::Imul => Opcode::ImulImm,
Opcode::Band => Opcode::BandImm,
Opcode::Bor => Opcode::BorImm,
Opcode::Bxor => Opcode::BxorImm,
Opcode::Isub => Opcode::IrsubImm,
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
if ty.bytes() <= native_word_width {
pos.func
.dfg
.replace(inst)
.BinaryImm64(new_opcode, ty, imm, args[1]);
}
}
}
InstructionData::Unary { opcode, arg } => {
if let Opcode::AdjustSpDown = opcode {
if let Some(imm) = resolve_imm64_value(&pos.func.dfg, arg) {
// Note this works for both positive and negative immediate values.
pos.func.dfg.replace(inst).adjust_sp_down_imm(imm);
}
}
}
InstructionData::BinaryImm64 { opcode, arg, imm } => {
let ty = pos.func.dfg.ctrl_typevar(inst);
let mut arg = arg;
let mut imm = imm;
match opcode {
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::BorImm
| Opcode::BandImm
| Opcode::BxorImm => {
// Fold binary_op(C2, binary_op(C1, x)) into binary_op(binary_op(C1, C2), x)
if let ValueDef::Result(arg_inst, _) = pos.func.dfg.value_def(arg) {
if let InstructionData::BinaryImm64 {
opcode: prev_opcode,
arg: prev_arg,
imm: prev_imm,
} = &pos.func.dfg[arg_inst]
{
if opcode == *prev_opcode
&& ty == pos.func.dfg.ctrl_typevar(arg_inst)
{
let lhs: i64 = imm.into();
let rhs: i64 = (*prev_imm).into();
let new_imm = match opcode {
Opcode::BorImm => lhs | rhs,
Opcode::BandImm => lhs & rhs,
Opcode::BxorImm => lhs ^ rhs,
Opcode::IaddImm => lhs.wrapping_add(rhs),
Opcode::ImulImm => lhs.wrapping_mul(rhs),
_ => panic!("can't happen"),
};
let new_imm = immediates::Imm64::from(new_imm);
let new_arg = *prev_arg;
pos.func
.dfg
.replace(inst)
.BinaryImm64(opcode, ty, new_imm, new_arg);
imm = new_imm;
arg = new_arg;
}
}
}
}
Opcode::UshrImm | Opcode::SshrImm => {
if pos.func.dfg.ctrl_typevar(inst).bytes() <= native_word_width
&& try_fold_extended_move(pos, inst, opcode, arg, imm)
{
return;
}
}
_ => {}
};
// Replace operations that are no-ops.
match (opcode, imm.into()) {
(Opcode::IaddImm, 0)
| (Opcode::ImulImm, 1)
| (Opcode::SdivImm, 1)
| (Opcode::UdivImm, 1)
| (Opcode::BorImm, 0)
| (Opcode::BandImm, -1)
| (Opcode::BxorImm, 0)
| (Opcode::RotlImm, 0)
| (Opcode::RotrImm, 0)
| (Opcode::IshlImm, 0)
| (Opcode::UshrImm, 0)
| (Opcode::SshrImm, 0) => {
// Alias the result value with the original argument.
replace_single_result_with_alias(&mut pos.func.dfg, inst, arg);
}
(Opcode::ImulImm, 0) | (Opcode::BandImm, 0) => {
// Replace by zero.
pos.func.dfg.replace(inst).iconst(ty, 0);
}
(Opcode::BorImm, -1) => {
// Replace by minus one.
pos.func.dfg.replace(inst).iconst(ty, -1);
}
_ => {}
}
}
InstructionData::IntCompare { opcode, cond, args } => {
debug_assert_eq!(opcode, Opcode::Icmp);
if let Some(imm) = resolve_imm64_value(&pos.func.dfg, args[1]) {
if pos.func.dfg.ctrl_typevar(inst).bytes() <= native_word_width {
pos.func.dfg.replace(inst).icmp_imm(cond, args[0], imm);
}
}
}
InstructionData::CondTrap { .. }
| InstructionData::Branch { .. }
| InstructionData::Ternary {
opcode: Opcode::Select,
..
} => {
// Fold away a redundant `bint`.
let condition_def = {
let args = pos.func.dfg.inst_args(inst);
pos.func.dfg.value_def(args[0])
};
if let ValueDef::Result(def_inst, _) = condition_def {
if let InstructionData::Unary {
opcode: Opcode::Bint,
arg: bool_val,
} = pos.func.dfg[def_inst]
{
let args = pos.func.dfg.inst_args_mut(inst);
args[0] = bool_val;
}
}
}
InstructionData::Ternary {
opcode: Opcode::Bitselect,
args,
} => {
let old_cond_type = pos.func.dfg.value_type(args[0]);
if !old_cond_type.is_vector() {
return;
}
            // Replace bitselect with vselect if each lane of the controlling mask is
            // either all ones or all zeroes; on x86, bitselect is encoded with three
            // instructions, while vselect can be encoded with a single BLEND instruction.
if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
InstructionData::Unary {
opcode: Opcode::RawBitcast,
arg,
} => {
                        // If the controlling mask is a raw-bitcasted boolean vector, then
                        // we know each lane is either all zeroes or all ones,
                        // so we can use the vselect instruction instead.
let arg_type = pos.func.dfg.value_type(arg);
if !arg_type.is_vector() || !arg_type.lane_type().is_bool() {
return;
}
(arg, arg_type)
}
InstructionData::UnaryConst {
opcode: Opcode::Vconst,
constant_handle,
} => {
                        // If each byte of the controlling mask is 0x00 or 0xFF, then we will
                        // always bitcast our way to vselect(B8x16, I8x16, I8x16).
                        // Bitselect operates at the bit level, so the lane types don't matter.
let const_data = pos.func.dfg.constants.get(constant_handle);
if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
return;
}
let new_type = B8.by(old_cond_type.bytes() as u16).unwrap();
(pos.ins().raw_bitcast(new_type, args[0]), new_type)
}
_ => return,
};
let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
let old_arg_type = pos.func.dfg.value_type(args[1]);
if arg_type != old_arg_type {
                    // Operand types must match; we need to add bitcasts.
let arg1 = pos.ins().raw_bitcast(arg_type, args[1]);
let arg2 = pos.ins().raw_bitcast(arg_type, args[2]);
let ret = pos.ins().vselect(cond_val, arg1, arg2);
pos.func.dfg.replace(inst).raw_bitcast(old_arg_type, ret);
} else {
pos.func
.dfg
.replace(inst)
.vselect(cond_val, args[1], args[2]);
}
}
}
_ => {}
}
}
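Since this function packs several rewrites into one match, here is a minimal, self-contained sketch of the two `BinaryImm64` rules above, written against a hypothetical `Expr` mini-IR rather than Cranelift's real types: nested immediate operations are merged (binary_op(C2, binary_op(C1, x)) becomes binary_op(C1 op C2, x)), and identity operations such as adding 0 or AND-ing with all ones collapse to the original value.

// Hypothetical mini-IR, used only to illustrate the folds performed by the
// `BinaryImm64` arm of `simplify` above; this is not Cranelift's real IR.
#[derive(Debug, PartialEq)]
enum Expr {
    Value(u32),              // an opaque SSA value
    IaddImm(Box<Expr>, i64), // x + C (wrapping)
    BandImm(Box<Expr>, i64), // x & C
}

fn fold(e: Expr) -> Expr {
    match e {
        // binary_op(C2, binary_op(C1, x)) => binary_op(C1 op C2, x)
        Expr::IaddImm(inner, c2) => match fold(*inner) {
            Expr::IaddImm(x, c1) => fold(Expr::IaddImm(x, c1.wrapping_add(c2))),
            // iadd_imm(x, 0) is a no-op: alias the result to x.
            x if c2 == 0 => x,
            x => Expr::IaddImm(Box::new(x), c2),
        },
        Expr::BandImm(inner, c2) => match fold(*inner) {
            Expr::BandImm(x, c1) => fold(Expr::BandImm(x, c1 & c2)),
            // band_imm(x, -1) is a no-op.
            x if c2 == -1 => x,
            x => Expr::BandImm(Box::new(x), c2),
        },
        other => other,
    }
}

fn main() {
    let nested = Expr::IaddImm(Box::new(Expr::IaddImm(Box::new(Expr::Value(0)), 3)), 4);
    assert_eq!(fold(nested), Expr::IaddImm(Box::new(Expr::Value(0)), 7));

    let noop = Expr::BandImm(Box::new(Expr::Value(1)), -1);
    assert_eq!(fold(noop), Expr::Value(1));
}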
struct BranchOptInfo {
br_inst: Inst,
cmp_arg: Value,
args: ValueList,
new_opcode: Opcode,
}
/// Fold comparisons into branch operations when possible.
///
/// This matches against operations which compare against zero, then use the
/// result in a `brz` or `brnz` branch. It folds those two operations into a
/// single `brz` or `brnz`.
fn branch_opt(pos: &mut FuncCursor, inst: Inst) {
let mut info = if let InstructionData::Branch {
opcode: br_opcode,
args: ref br_args,
..
} = pos.func.dfg[inst]
{
let first_arg = {
let args = pos.func.dfg.inst_args(inst);
args[0]
};
let icmp_inst =
if let ValueDef::Result(icmp_inst, _) = pos.func.dfg.value_def(first_arg) {
icmp_inst
} else {
return;
};
if let InstructionData::IntCompareImm {
opcode: Opcode::IcmpImm,
arg: cmp_arg,
cond: cmp_cond,
imm: cmp_imm,
} = pos.func.dfg[icmp_inst]
{
let cmp_imm: i64 = cmp_imm.into();
if cmp_imm != 0 {
return;
}
// icmp_imm returns non-zero when the comparison is true. So, if
// we're branching on zero, we need to invert the condition.
let cond = match br_opcode {
Opcode::Brz => cmp_cond.inverse(),
Opcode::Brnz => cmp_cond,
_ => return,
};
let new_opcode = match cond {
IntCC::Equal => Opcode::Brz,
IntCC::NotEqual => Opcode::Brnz,
_ => return,
};
BranchOptInfo {
br_inst: inst,
cmp_arg,
args: br_args.clone(),
new_opcode,
}
} else {
return;
}
} else {
return;
};
info.args.as_mut_slice(&mut pos.func.dfg.value_lists)[0] = info.cmp_arg;
if let InstructionData::Branch { ref mut opcode, .. } = pos.func.dfg[info.br_inst] {
*opcode = info.new_opcode;
} else {
panic!();
}
}
}
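A rough, self-contained model of the rewrite this function performs, using hypothetical `Branch`/`Cond` enums rather than Cranelift's instruction data: a compare-against-zero feeding `brz`/`brnz` is folded into a single branch on the compared value, inverting the condition when the branch fires on zero.

// Hypothetical, simplified model of the brz/brnz-over-icmp_imm fold done by
// `branch_opt`; these enums are illustrative, not Cranelift's InstructionData.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Cond { Eq, Ne }

#[derive(Clone, Copy, Debug, PartialEq)]
enum Branch {
    Brz(&'static str),  // branch taken when the named value is zero
    Brnz(&'static str), // branch taken when the named value is non-zero
}

// `cmp` is the condition and operand of an `icmp_imm cond x, 0` feeding the branch.
fn fold_branch(br: Branch, cmp: (Cond, &'static str)) -> Branch {
    let (cond, x) = cmp;
    // icmp_imm is non-zero when the comparison holds, so branching on zero
    // means branching when the comparison is false: invert the condition.
    let effective = match br {
        Branch::Brz(_) => match cond { Cond::Eq => Cond::Ne, Cond::Ne => Cond::Eq },
        Branch::Brnz(_) => cond,
    };
    match effective {
        Cond::Eq => Branch::Brz(x),  // take the branch iff x == 0
        Cond::Ne => Branch::Brnz(x), // take the branch iff x != 0
    }
}

fn main() {
    // v = icmp_imm eq x, 0; brnz v, block  ==>  brz x, block
    assert_eq!(fold_branch(Branch::Brnz("v"), (Cond::Eq, "x")), Branch::Brz("x"));
    // v = icmp_imm ne x, 0; brz v, block   ==>  brz x, block
    assert_eq!(fold_branch(Branch::Brz("v"), (Cond::Ne, "x")), Branch::Brz("x"));
}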
/// The main pre-opt pass.
pub fn do_preopt(func: &mut Function, cfg: &mut ControlFlowGraph, isa: &dyn TargetIsa) {
let _tt = timing::preopt();
let mut pos = FuncCursor::new(func);
let native_word_width = isa.pointer_bytes();
let native_word_width = isa.pointer_bytes() as u32;
let mut optimizer = simplify::peephole_optimizer(isa);
while let Some(block) = pos.next_block() {
while let Some(inst) = pos.next_inst() {
// Apply basic simplifications.
simplify(&mut pos, inst, native_word_width as u32);
simplify::apply_all(&mut optimizer, &mut pos, inst, native_word_width);
// Try to transform divide-by-constant into simpler operations.
if let Some(divrem_info) = get_div_info(inst, &pos.func.dfg) {
@ -960,7 +1099,6 @@ pub fn do_preopt(func: &mut Function, cfg: &mut ControlFlowGraph, isa: &dyn Targ
continue;
}
branch_opt(&mut pos, inst);
branch_order(&mut pos, cfg, block, inst);
}
}
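The `get_div_info` step above (its transformation body is elided from this hunk) feeds a divide-by-constant rewrite. A hedged sketch of the simplest case only, unsigned division and remainder by a power of two; the real pass also handles general constants via multiply-by-magic-number sequences.

// Illustrative only: the power-of-two special case of divide-by-constant
// strength reduction; the actual pass also handles non-power-of-two divisors.
fn udiv_by_const(x: u32, d: u32) -> u32 {
    assert!(d != 0);
    if d.is_power_of_two() {
        x >> d.trailing_zeros() // x / 2^k == x >> k
    } else {
        x / d // the real transformation uses a multiply-high by a "magic" constant
    }
}

fn urem_by_const(x: u32, d: u32) -> u32 {
    assert!(d != 0);
    if d.is_power_of_two() {
        x & (d - 1) // x % 2^k == x & (2^k - 1)
    } else {
        x % d
    }
}

fn main() {
    for &x in &[0u32, 1, 7, 8, 9, 1234, u32::MAX] {
        assert_eq!(udiv_by_const(x, 8), x / 8);
        assert_eq!(urem_by_const(x, 8), x % 8);
    }
}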


@ -62,6 +62,7 @@ define_passes! {
gvn: "Global value numbering",
licm: "Loop invariant code motion",
unreachable_code: "Remove unreachable blocks",
remove_constant_phis: "Remove constant phi-nodes",
regalloc: "Register allocation",
ra_liveness: "RA liveness analysis",


@ -18,9 +18,9 @@ use serde::{Deserialize, Serialize};
pub struct ValueLocRange {
/// The ValueLoc containing a ValueLabel during this range.
pub loc: ValueLoc,
/// The start of the range.
/// The start of the range. It is an offset in the generated code.
pub start: u32,
/// The end of the range.
/// The end of the range. It is an offset in the generated code.
pub end: u32,
}
@ -91,6 +91,11 @@ pub fn build_value_labels_ranges<T>(
where
T: From<SourceLoc> + Deref<Target = SourceLoc> + Ord + Copy,
{
// FIXME(#1523): New-style backend does not yet have debug info.
if isa.get_mach_backend().is_some() {
return HashMap::new();
}
let values_labels = build_value_labels_index::<T>(func);
let mut blocks = func.layout.blocks().collect::<Vec<_>>();


@ -756,10 +756,10 @@ impl<'a> Verifier<'a> {
| UnaryIeee64 { .. }
| UnaryBool { .. }
| Binary { .. }
| BinaryImm { .. }
| BinaryImm8 { .. }
| BinaryImm64 { .. }
| Ternary { .. }
| InsertLane { .. }
| ExtractLane { .. }
| TernaryImm8 { .. }
| Shuffle { .. }
| IntCompare { .. }
| IntCompareImm { .. }
@ -1912,20 +1912,20 @@ impl<'a> Verifier<'a> {
Ok(())
}
}
ir::InstructionData::ExtractLane {
ir::InstructionData::BinaryImm8 {
opcode: ir::instructions::Opcode::Extractlane,
lane,
imm: lane,
arg,
..
}
| ir::InstructionData::InsertLane {
| ir::InstructionData::TernaryImm8 {
opcode: ir::instructions::Opcode::Insertlane,
lane,
imm: lane,
args: [arg, _],
..
} => {
// We must be specific about the opcodes above because other instructions are using
// the ExtractLane/InsertLane formats.
// the same formats.
let ty = self.func.dfg.value_type(arg);
if u16::from(lane) >= ty.lane_count() {
errors.fatal((


@ -508,7 +508,8 @@ pub fn write_operands(
constant_handle, ..
} => write!(w, " {}", constant_handle),
Binary { args, .. } => write!(w, " {}, {}", args[0], args[1]),
BinaryImm { arg, imm, .. } => write!(w, " {}, {}", arg, imm),
BinaryImm8 { arg, imm, .. } => write!(w, " {}, {}", arg, imm),
BinaryImm64 { arg, imm, .. } => write!(w, " {}, {}", arg, imm),
Ternary { args, .. } => write!(w, " {}, {}, {}", args[0], args[1], args[2]),
MultiAry { ref args, .. } => {
if args.is_empty() {
@ -518,8 +519,7 @@ pub fn write_operands(
}
}
NullAry { .. } => write!(w, " "),
InsertLane { lane, args, .. } => write!(w, " {}, {}, {}", args[0], lane, args[1]),
ExtractLane { lane, arg, .. } => write!(w, " {}, {}", arg, lane),
TernaryImm8 { imm, args, .. } => write!(w, " {}, {}, {}", args[0], args[1], imm),
Shuffle { mask, args, .. } => {
let data = dfg.immediates.get(mask).expect(
"Expected the shuffle mask to already be inserted into the immediates table",


@ -1 +1 @@
{"files":{"Cargo.toml":"cd1dd7e4040349ff8e5e88cbc3273c2b52cb411853933de6aea8976a1a99445f","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"96ceffbfd88fb06e3b41aa4d3087cffbbf8441d04eba7ab09662a72ab600a321","src/boxed_slice.rs":"69d539b72460c0aba1d30e0b72efb0c29d61558574d751c784794e14abf41352","src/iter.rs":"4a4d3309fe9aad14fd7702f02459f4277b4ddb50dba700e58dcc75665ffebfb3","src/keys.rs":"b8c2fba26dee15bf3d1880bb2b41e8d66fe1428d242ee6d9fd30ee94bbd0407d","src/lib.rs":"f6d738a46f1dca8b0c82a5910d86cd572a3585ab7ef9f73dac96962529069190","src/list.rs":"4bf609eb7cc7c000c18da746596d5fcc67eece3f919ee2d76e19f6ac371640d1","src/map.rs":"546b36be4cbbd2423bacbed69cbe114c63538c3f635e15284ab8e4223e717705","src/packed_option.rs":"dccb3dd6fc87eba0101de56417f21cab67a4394831df9fa41e3bbddb70cdf694","src/primary.rs":"30d5e2ab8427fd2b2c29da395812766049e3c40845cc887af3ee233dba91a063","src/set.rs":"b040054b8baa0599e64df9ee841640688e2a73b6eabbdc5a4f15334412db052a","src/sparse.rs":"536e31fdcf64450526f5e5b85e97406c26b998bc7e0d8161b6b449c24265449f"},"package":null}
{"files":{"Cargo.toml":"c4ee5d42f3f76a1458ec0d97b5777569906819fe5b4002512de0e69814754c53","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"96ceffbfd88fb06e3b41aa4d3087cffbbf8441d04eba7ab09662a72ab600a321","src/boxed_slice.rs":"69d539b72460c0aba1d30e0b72efb0c29d61558574d751c784794e14abf41352","src/iter.rs":"4a4d3309fe9aad14fd7702f02459f4277b4ddb50dba700e58dcc75665ffebfb3","src/keys.rs":"b8c2fba26dee15bf3d1880bb2b41e8d66fe1428d242ee6d9fd30ee94bbd0407d","src/lib.rs":"5ecb434f18c343f68c7080514c71f8c79c21952d1774beffa1bf348b6dd77b05","src/list.rs":"4bf609eb7cc7c000c18da746596d5fcc67eece3f919ee2d76e19f6ac371640d1","src/map.rs":"546b36be4cbbd2423bacbed69cbe114c63538c3f635e15284ab8e4223e717705","src/packed_option.rs":"d931ba5ce07a5c77c8a62bb07316db21c101bc3fa1eb6ffd396f8a8944958185","src/primary.rs":"30d5e2ab8427fd2b2c29da395812766049e3c40845cc887af3ee233dba91a063","src/set.rs":"b040054b8baa0599e64df9ee841640688e2a73b6eabbdc5a4f15334412db052a","src/sparse.rs":"536e31fdcf64450526f5e5b85e97406c26b998bc7e0d8161b6b449c24265449f"},"package":null}


@ -1,7 +1,7 @@
[package]
authors = ["The Cranelift Project Developers"]
name = "cranelift-entity"
version = "0.63.0"
version = "0.64.0"
description = "Data structures using entity references as mapping keys"
license = "Apache-2.0 WITH LLVM-exception"
documentation = "https://docs.rs/cranelift-entity"


@ -85,6 +85,10 @@ macro_rules! entity_impl {
fn reserved_value() -> $entity {
$entity($crate::__core::u32::MAX)
}
fn is_reserved_value(&self) -> bool {
self.0 == $crate::__core::u32::MAX
}
}
impl $entity {


@ -11,9 +11,11 @@ use core::fmt;
use core::mem;
/// Types that have a reserved value which can't be created any other way.
pub trait ReservedValue: Eq {
pub trait ReservedValue {
/// Create an instance of the reserved value.
fn reserved_value() -> Self;
    /// Checks whether this value is the reserved one.
fn is_reserved_value(&self) -> bool;
}
/// Packed representation of `Option<T>`.
@ -23,12 +25,12 @@ pub struct PackedOption<T: ReservedValue>(T);
impl<T: ReservedValue> PackedOption<T> {
/// Returns `true` if the packed option is a `None` value.
pub fn is_none(&self) -> bool {
self.0 == T::reserved_value()
self.0.is_reserved_value()
}
/// Returns `true` if the packed option is a `Some` value.
pub fn is_some(&self) -> bool {
self.0 != T::reserved_value()
!self.0.is_reserved_value()
}
/// Expand the packed option into a normal `Option`.
@ -75,7 +77,7 @@ impl<T: ReservedValue> From<T> for PackedOption<T> {
/// Convert `t` into a packed `Some(x)`.
fn from(t: T) -> Self {
debug_assert!(
t != T::reserved_value(),
!t.is_reserved_value(),
"Can't make a PackedOption from the reserved value."
);
Self(t)
@ -123,6 +125,10 @@ mod tests {
fn reserved_value() -> Self {
NoC(13)
}
fn is_reserved_value(&self) -> bool {
self.0 == 13
}
}
#[test]
@ -145,6 +151,10 @@ mod tests {
fn reserved_value() -> Self {
Ent(13)
}
fn is_reserved_value(&self) -> bool {
self.0 == 13
}
}
#[test]
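To see why `ReservedValue` grows an `is_reserved_value` method (and drops the `Eq` bound), here is a small self-contained sketch, independent of the real cranelift-entity crate: the packed option can now ask the value itself whether it is in the reserved state instead of comparing against `T::reserved_value()` with `==`.

// Simplified stand-ins for cranelift-entity's ReservedValue / PackedOption,
// written here only to illustrate the new `is_reserved_value` hook.
trait ReservedValue {
    fn reserved_value() -> Self;
    fn is_reserved_value(&self) -> bool;
}

struct PackedOption<T: ReservedValue>(T);

impl<T: ReservedValue> PackedOption<T> {
    fn none() -> Self {
        Self(T::reserved_value())
    }
    fn some(t: T) -> Self {
        debug_assert!(!t.is_reserved_value(), "reserved value used as Some");
        Self(t)
    }
    fn is_none(&self) -> bool {
        // No `Eq` needed: the type itself knows its reserved state.
        self.0.is_reserved_value()
    }
    fn expand(self) -> Option<T> {
        if self.0.is_reserved_value() { None } else { Some(self.0) }
    }
}

// An entity index that reserves u32::MAX, mirroring what `entity_impl!` generates.
#[derive(Debug, PartialEq)]
struct Block(u32);

impl ReservedValue for Block {
    fn reserved_value() -> Self { Block(u32::MAX) }
    fn is_reserved_value(&self) -> bool { self.0 == u32::MAX }
}

fn main() {
    let a: PackedOption<Block> = PackedOption::some(Block(7));
    let b: PackedOption<Block> = PackedOption::none();
    assert!(!a.is_none());
    assert!(b.is_none());
    assert_eq!(a.expand(), Some(Block(7)));
}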


@ -1 +1 @@
{"files":{"Cargo.toml":"d152c6553c0091b43d9ea0cd547dc49440e6321eb792bf47fdd3245aed046513","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"dea43e8044284df50f8b8772e9b48ba8b109b45c74111ff73619775d57ad8d67","src/frontend.rs":"f750cc995c66635dab7f2b977266cf9235d984b585ab8145bdb858ea8e1b0fb4","src/lib.rs":"5197f467d1625ee2b117a168f4b1886b4b69d4250faea6618360a5adc70b4e0c","src/ssa.rs":"650d26025706cfb63935f956bca6f166b0edfa32260cd2a8c27f9b49fcc743c3","src/switch.rs":"3bf1f11817565b95edfbc9393ef2bfdeacf534264c9d44b4f93d1432b353af6c","src/variable.rs":"399437bd7d2ac11a7a748bad7dd1f6dac58824d374ec318f36367a9d077cc225"},"package":null}
{"files":{"Cargo.toml":"084cc46ba2d09a2ee8085c37be8624b3cc249d381f1cbee6df468930ce15e415","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"dea43e8044284df50f8b8772e9b48ba8b109b45c74111ff73619775d57ad8d67","src/frontend.rs":"d1d8477572f70cc28f71424af272d9eec0adf58af657ff153c4acbbb39822a50","src/lib.rs":"5197f467d1625ee2b117a168f4b1886b4b69d4250faea6618360a5adc70b4e0c","src/ssa.rs":"650d26025706cfb63935f956bca6f166b0edfa32260cd2a8c27f9b49fcc743c3","src/switch.rs":"3bf1f11817565b95edfbc9393ef2bfdeacf534264c9d44b4f93d1432b353af6c","src/variable.rs":"399437bd7d2ac11a7a748bad7dd1f6dac58824d374ec318f36367a9d077cc225"},"package":null}


@ -1,7 +1,7 @@
[package]
authors = ["The Cranelift Project Developers"]
name = "cranelift-frontend"
version = "0.63.0"
version = "0.64.0"
description = "Cranelift IR builder helper"
license = "Apache-2.0 WITH LLVM-exception"
documentation = "https://docs.rs/cranelift-frontend"
@ -11,7 +11,7 @@ readme = "README.md"
edition = "2018"
[dependencies]
cranelift-codegen = { path = "../codegen", version = "0.63.0", default-features = false }
cranelift-codegen = { path = "../codegen", version = "0.64.0", default-features = false }
target-lexicon = "0.10"
log = { version = "0.4.6", default-features = false }
hashbrown = { version = "0.7", optional = true }


@ -272,6 +272,12 @@ impl<'a> FunctionBuilder<'a> {
/// In order to use a variable in a `use_var`, you need to declare its type with this method.
pub fn declare_var(&mut self, var: Variable, ty: Type) {
debug_assert_eq!(
self.func_ctx.types[var],
types::INVALID,
"variable {:?} is declared twice",
var
);
self.func_ctx.types[var] = ty;
}
@ -285,6 +291,12 @@ impl<'a> FunctionBuilder<'a> {
var
)
});
debug_assert_ne!(
ty,
types::INVALID,
"variable {:?} is used but its type has not been declared",
var
);
self.func_ctx
.ssa
.use_var(self.func, var, ty, self.position.unwrap())
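The two new debug assertions enforce a declare-before-use contract on variables. The following toy model (not the real cranelift-frontend types; `VarTable` and `Ty` are made up for illustration) mirrors how an INVALID type entry is used as the "not yet declared" sentinel.

// A toy model of the declare-before-use contract the new debug assertions enforce.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Ty { Invalid, I32 }

struct VarTable {
    types: Vec<Ty>, // indexed by variable number; Ty::Invalid means "not declared yet"
}

impl VarTable {
    fn new(capacity: usize) -> Self {
        Self { types: vec![Ty::Invalid; capacity] }
    }
    fn declare_var(&mut self, var: usize, ty: Ty) {
        // Mirrors the new assertion in declare_var: declaring twice is a bug.
        debug_assert_eq!(self.types[var], Ty::Invalid, "variable {} declared twice", var);
        self.types[var] = ty;
    }
    fn use_var(&self, var: usize) -> Ty {
        let ty = self.types[var];
        // Mirrors the new assertion in use_var: using before declaring is a bug.
        debug_assert_ne!(ty, Ty::Invalid, "variable {} used before declaration", var);
        ty
    }
}

fn main() {
    let mut vars = VarTable::new(4);
    vars.declare_var(0, Ty::I32);
    assert_eq!(vars.use_var(0), Ty::I32);
    // In a debug build, either of the following would now panic:
    // vars.declare_var(0, Ty::I32); // declared twice
    // vars.use_var(1);              // used before declaration
}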


@ -1 +1 @@
{"files":{"Cargo.toml":"107a12d0bc1ee99c8ffd9cf746c4d06040a90bd5769fc29d36a88371d09a67b2","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"cce724251d4abc08c6492e1e25c138ab5a0d11e9ac90bc573652b18e034f56ed","src/code_translator.rs":"4b70704fd50b24cd695d0a469d92f06d4a4fc328f24247a6c7a1ba39ac301ee0","src/environ/dummy.rs":"49bce7a8eb9f21a61c12db537b51ab6bdb3d0e1eb6253084268256d96cae68a5","src/environ/mod.rs":"b6f33f619090ff497b4e22150d77a290f259716374ac2e377b73c47cd1dafe85","src/environ/spec.rs":"3a1543f99bff340c7f6bbe3f7cb8e8ec829e4139957f3c578d5b03e29df50f9e","src/func_translator.rs":"a165063eafedbb8e6b632996f747eeb49a3d6f8a70cab6d741abfc4fd9af892d","src/lib.rs":"05b9994c062faf2065046d1e4d7caffb26823816f367d77ede6918e24fcfa6b0","src/module_translator.rs":"bcdf5a84226b726a73f4be0acb0318ca89c82584460101378e73021d85bd4485","src/sections_translator.rs":"8c4c24308332c63d16fcf19693a7ecff2239e73b4752b0d3830b273fabcee9f1","src/state/func_state.rs":"b114522784984a7cc26a3549c7c17f842885e1232254de81d938f9d155f95aa6","src/state/mod.rs":"20014cb93615467b4d20321b52f67f66040417efcaa739a4804093bb559eed19","src/state/module_state.rs":"2f299b043deb806b48583fe54bbb46708f7d8a1454b7be0eb285568064e5a7f9","src/translation_utils.rs":"a1723cf6c216edd8aa845c61b80907167569f0c830344e0f2dc86a7232d45c5c","tests/wasm_testsuite.rs":"730304f139371e5ef3fd913ec271fc4db181869b447c6ed26c54313b5c31495c"},"package":null}
{"files":{"Cargo.toml":"3dd16e5f91cb20bc9afaff9880e1035d1c33c68851f593e6f2c5a0c92e292133","LICENSE":"268872b9816f90fd8e85db5a28d33f8150ebb8dd016653fb39ef1f94f2686bc5","README.md":"cce724251d4abc08c6492e1e25c138ab5a0d11e9ac90bc573652b18e034f56ed","src/code_translator.rs":"f9befe6f5a53eede1e9937abe0bced442f8c0276996bfb4d77c27e81d4746b4f","src/environ/dummy.rs":"07b6510a7141b92769c914e37386790486f92b691beb0876b8590f2ae5489ee4","src/environ/mod.rs":"692f35d75f125f9c071f7166252f427e4bac29401356f73307c6c36e23c667fb","src/environ/spec.rs":"2ff8524cd592efdef67e5f8d06d144f7d628dee8183848ff4f5e35850f3ce550","src/func_translator.rs":"eb1fcea970407eda872984808e9a3e3a3297c2dea6e3a600ee7116ca89c7b49f","src/lib.rs":"6d3662b3f219a3f7a26f6b44b7921a19da1d892cf78f5a4434fdced5753b069f","src/module_translator.rs":"bcdf5a84226b726a73f4be0acb0318ca89c82584460101378e73021d85bd4485","src/sections_translator.rs":"db567511e273a9e383b18a15fc47f74a1247cbe13f120d7656c21660be53ab78","src/state/func_state.rs":"b114522784984a7cc26a3549c7c17f842885e1232254de81d938f9d155f95aa6","src/state/mod.rs":"20014cb93615467b4d20321b52f67f66040417efcaa739a4804093bb559eed19","src/state/module_state.rs":"3cb3d9de26ec7ccc0ba81ed82163f27648794d4d1d1162eae8eee80a3c0ac05a","src/translation_utils.rs":"20082fded6a8d3637eccbda4465355d8d9fab0a1cd8222accb10cb3e06543689","tests/wasm_testsuite.rs":"da8dedfd11918946e9cf6af68fd4826f020ef90a4e22742b1a30e61a3fb4aedd"},"package":null}

third_party/rust/cranelift-wasm/Cargo.toml (vendored)

@ -1,6 +1,6 @@
[package]
name = "cranelift-wasm"
version = "0.63.0"
version = "0.64.0"
authors = ["The Cranelift Project Developers"]
description = "Translator from WebAssembly to Cranelift IR"
documentation = "https://docs.rs/cranelift-wasm"
@ -12,20 +12,20 @@ keywords = ["webassembly", "wasm"]
edition = "2018"
[dependencies]
wasmparser = { version = "0.51.0", default-features = false }
cranelift-codegen = { path = "../codegen", version = "0.63.0", default-features = false }
cranelift-entity = { path = "../entity", version = "0.63.0" }
cranelift-frontend = { path = "../frontend", version = "0.63.0", default-features = false }
wasmparser = { version = "0.57.0", default-features = false }
cranelift-codegen = { path = "../codegen", version = "0.64.0", default-features = false }
cranelift-entity = { path = "../entity", version = "0.64.0" }
cranelift-frontend = { path = "../frontend", version = "0.64.0", default-features = false }
hashbrown = { version = "0.7", optional = true }
log = { version = "0.4.6", default-features = false }
serde = { version = "1.0.94", features = ["derive"], optional = true }
thiserror = "1.0.4"
[dev-dependencies]
wat = "1.0.9"
wat = "1.0.18"
target-lexicon = "0.10"
# Enable the riscv feature for cranelift-codegen, as some tests require it
cranelift-codegen = { path = "../codegen", version = "0.63.0", default-features = false, features = ["riscv"] }
cranelift-codegen = { path = "../codegen", version = "0.64.0", default-features = false, features = ["riscv"] }
[features]
default = ["std"]


@ -125,7 +125,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
GlobalVariable::Memory { gv, offset, ty } => {
let addr = builder.ins().global_value(environ.pointer_type(), gv);
let flags = ir::MemFlags::trusted();
let val = state.pop1();
let mut val = state.pop1();
// Ensure SIMD values are cast to their default Cranelift type, I8x16.
if ty.is_vector() {
val = optionally_bitcast_vector(val, I8X16, builder);
}
debug_assert_eq!(ty, builder.func.dfg.value_type(val));
builder.ins().store(flags, val, addr, offset);
}
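The added bitcast follows the translator's convention that SIMD values are carried in the default I8X16 shape and reinterpreted where a specific lane type is needed. A toy model of such a "raw bitcast", illustrative only and unrelated to Cranelift's IR:

// Toy model of a "raw bitcast": the same 128 bits reinterpreted under a
// different lane shape (four 32-bit lanes viewed as sixteen byte lanes).
fn i32x4_as_i8x16(v: [u32; 4]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for (i, lane) in v.iter().enumerate() {
        out[i * 4..i * 4 + 4].copy_from_slice(&lane.to_le_bytes());
    }
    out
}

fn main() {
    let bytes = i32x4_as_i8x16([1, 2, 3, 4]);
    assert_eq!(&bytes[0..4], &[1u8, 0, 0, 0]); // same bits, byte-lane view
}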
@ -357,7 +361,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
// We signal that all the code that follows until the next End is unreachable
frame.set_branched_to_exit();
let return_count = if frame.is_loop() {
0
frame.num_param_values()
} else {
frame.num_return_values()
};
@ -1035,8 +1039,8 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
Operator::F32Le | Operator::F64Le => {
translate_fcmp(FloatCC::LessThanOrEqual, builder, state)
}
Operator::RefNull => state.push1(builder.ins().null(environ.reference_type())),
Operator::RefIsNull => {
Operator::RefNull { ty: _ } => state.push1(builder.ins().null(environ.reference_type())),
Operator::RefIsNull { ty: _ } => {
let arg = state.pop1();
let val = builder.ins().is_null(arg);
let val_int = builder.ins().bint(I32, val);
@ -1167,23 +1171,26 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
)?);
}
Operator::TableGrow { table } => {
let table_index = TableIndex::from_u32(*table);
let delta = state.pop1();
let init_value = state.pop1();
state.push1(environ.translate_table_grow(
builder.cursor(),
*table,
table_index,
delta,
init_value,
)?);
}
Operator::TableGet { table } => {
let table_index = TableIndex::from_u32(*table);
let index = state.pop1();
state.push1(environ.translate_table_get(builder.cursor(), *table, index)?);
state.push1(environ.translate_table_get(builder.cursor(), table_index, index)?);
}
Operator::TableSet { table } => {
let table_index = TableIndex::from_u32(*table);
let value = state.pop1();
let index = state.pop1();
environ.translate_table_set(builder.cursor(), *table, value, index)?;
environ.translate_table_set(builder.cursor(), table_index, value, index)?;
}
Operator::TableCopy {
dst_table: dst_table_index,
@ -1206,10 +1213,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
)?;
}
Operator::TableFill { table } => {
let table_index = TableIndex::from_u32(*table);
let len = state.pop1();
let val = state.pop1();
let dest = state.pop1();
environ.translate_table_fill(builder.cursor(), *table, dest, val, len)?;
environ.translate_table_fill(builder.cursor(), table_index, dest, val, len)?;
}
Operator::TableInit {
segment,
@ -1302,7 +1310,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let ty = type_of(op);
let reduced = builder.ins().ireduce(ty.lane_type(), replacement);
let vector = optionally_bitcast_vector(vector, ty, builder);
state.push1(builder.ins().insertlane(vector, *lane, reduced))
state.push1(builder.ins().insertlane(vector, reduced, *lane))
}
Operator::I32x4ReplaceLane { lane }
| Operator::I64x2ReplaceLane { lane }
@ -1310,7 +1318,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
| Operator::F64x2ReplaceLane { lane } => {
let (vector, replacement) = state.pop2();
let vector = optionally_bitcast_vector(vector, type_of(op), builder);
state.push1(builder.ins().insertlane(vector, *lane, replacement))
state.push1(builder.ins().insertlane(vector, replacement, *lane))
}
Operator::V8x16Shuffle { lanes, .. } => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
@ -1375,7 +1383,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let a = pop1_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().ineg(a))
}
Operator::I16x8Mul | Operator::I32x4Mul => {
Operator::I16x8Mul | Operator::I32x4Mul | Operator::I64x2Mul => {
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().imul(a, b))
}
@ -1402,7 +1410,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
Operator::I8x16Shl | Operator::I16x8Shl | Operator::I32x4Shl | Operator::I64x2Shl => {
let (a, b) = state.pop2();
let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
let bitwidth = i64::from(builder.func.dfg.value_type(a).bits());
let bitwidth = i64::from(type_of(op).lane_bits());
// The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
// we do `b AND 15`; this means fewer instructions than `iconst + urem`.
let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
@ -1411,16 +1419,16 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
Operator::I8x16ShrU | Operator::I16x8ShrU | Operator::I32x4ShrU | Operator::I64x2ShrU => {
let (a, b) = state.pop2();
let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
let bitwidth = i64::from(builder.func.dfg.value_type(a).bits());
let bitwidth = i64::from(type_of(op).lane_bits());
// The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
// we do `b AND 15`; this means fewer instructions than `iconst + urem`.
let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
state.push1(builder.ins().ushr(bitcast_a, b_mod_bitwidth))
}
Operator::I8x16ShrS | Operator::I16x8ShrS | Operator::I32x4ShrS => {
Operator::I8x16ShrS | Operator::I16x8ShrS | Operator::I32x4ShrS | Operator::I64x2ShrS => {
let (a, b) = state.pop2();
let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
let bitwidth = i64::from(builder.func.dfg.value_type(a).bits());
let bitwidth = i64::from(type_of(op).lane_bits());
// The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
// we do `b AND 15`; this means fewer instructions than `iconst + urem`.
let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
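To make the `b AND (lanewidth - 1)` trick in the comments above concrete: for a power-of-two lane width the mask is exactly the `mod` the spec requires, which is what lets the translator emit `band_imm` instead of `iconst` plus `urem`. A small stand-alone check (plain Rust, not Cranelift code):

// Checks that b & (w - 1) == b % w whenever the lane width w is a power of two,
// which is why the translator emits band_imm rather than iconst + urem.
fn main() {
    for &w in &[8u64, 16, 32, 64] {
        assert!(w.is_power_of_two());
        for b in 0..=200u64 {
            assert_eq!(b & (w - 1), b % w);
        }
    }
    // e.g. shifting 16-bit lanes by 18 actually shifts by 18 & 15 == 2.
    assert_eq!(18u64 & 15, 18u64 % 16);
}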
@ -1435,18 +1443,12 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
// operands must match (hence the bitcast).
state.push1(builder.ins().bitselect(bitcast_c, bitcast_a, bitcast_b))
}
Operator::I8x16AnyTrue
| Operator::I16x8AnyTrue
| Operator::I32x4AnyTrue
| Operator::I64x2AnyTrue => {
Operator::I8x16AnyTrue | Operator::I16x8AnyTrue | Operator::I32x4AnyTrue => {
let a = pop1_with_bitcast(state, type_of(op), builder);
let bool_result = builder.ins().vany_true(a);
state.push1(builder.ins().bint(I32, bool_result))
}
Operator::I8x16AllTrue
| Operator::I16x8AllTrue
| Operator::I32x4AllTrue
| Operator::I64x2AllTrue => {
Operator::I8x16AllTrue | Operator::I16x8AllTrue | Operator::I32x4AllTrue => {
let a = pop1_with_bitcast(state, type_of(op), builder);
let bool_result = builder.ins().vall_true(a);
state.push1(builder.ins().bint(I32, bool_result))
@ -1542,16 +1544,12 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let a = pop1_with_bitcast(state, I32X4, builder);
state.push1(builder.ins().fcvt_from_sint(F32X4, a))
}
Operator::I8x16Mul
| Operator::I64x2Mul
| Operator::I64x2ShrS
| Operator::I32x4TruncSatF32x4S
Operator::I32x4TruncSatF32x4S
| Operator::I32x4TruncSatF32x4U
| Operator::I64x2TruncSatF64x2S
| Operator::I64x2TruncSatF64x2U
| Operator::F32x4ConvertI32x4U
| Operator::F64x2ConvertI64x2S
| Operator::F64x2ConvertI64x2U { .. }
| Operator::I8x16Abs
| Operator::I16x8Abs
| Operator::I32x4Abs
| Operator::I8x16NarrowI16x8S { .. }
| Operator::I8x16NarrowI16x8U { .. }
| Operator::I16x8NarrowI32x4S { .. }
@ -1566,6 +1564,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
| Operator::I32x4WidenHighI16x8U { .. } => {
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
}
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
}
};
Ok(())
}
@ -1991,8 +1993,7 @@ fn type_of(operator: &Operator) -> Type {
| Operator::I8x16MinU
| Operator::I8x16MaxS
| Operator::I8x16MaxU
| Operator::I8x16RoundingAverageU
| Operator::I8x16Mul => I8X16,
| Operator::I8x16RoundingAverageU => I8X16,
Operator::I16x8Splat
| Operator::V16x8LoadSplat { .. }
@ -2063,15 +2064,12 @@ fn type_of(operator: &Operator) -> Type {
| Operator::I64x2ExtractLane { .. }
| Operator::I64x2ReplaceLane { .. }
| Operator::I64x2Neg
| Operator::I64x2AnyTrue
| Operator::I64x2AllTrue
| Operator::I64x2Shl
| Operator::I64x2ShrS
| Operator::I64x2ShrU
| Operator::I64x2Add
| Operator::I64x2Sub
| Operator::F64x2ConvertI64x2S
| Operator::F64x2ConvertI64x2U => I64X2,
| Operator::I64x2Mul => I64X2,
Operator::F32x4Splat
| Operator::F32x4ExtractLane { .. }
@ -2111,9 +2109,7 @@ fn type_of(operator: &Operator) -> Type {
| Operator::F64x2Mul
| Operator::F64x2Div
| Operator::F64x2Min
| Operator::F64x2Max
| Operator::I64x2TruncSatF64x2S
| Operator::I64x2TruncSatF64x2U => F64X2,
| Operator::F64x2Max => F64X2,
_ => unimplemented!(
"Currently only SIMD instructions are mapped to their return type; the \


@ -6,7 +6,8 @@
//! [Wasmtime]: https://github.com/bytecodealliance/wasmtime
use crate::environ::{
FuncEnvironment, GlobalVariable, ModuleEnvironment, ReturnMode, TargetEnvironment, WasmResult,
FuncEnvironment, GlobalVariable, ModuleEnvironment, ReturnMode, TargetEnvironment,
WasmFuncType, WasmResult,
};
use crate::func_translator::FuncTranslator;
use crate::state::ModuleTranslationState;
@ -433,7 +434,7 @@ impl<'dummy_environment> FuncEnvironment for DummyFuncEnvironment<'dummy_environ
fn translate_table_grow(
&mut self,
mut pos: FuncCursor,
_table_index: u32,
_table_index: TableIndex,
_delta: ir::Value,
_init_value: ir::Value,
) -> WasmResult<ir::Value> {
@ -443,7 +444,7 @@ impl<'dummy_environment> FuncEnvironment for DummyFuncEnvironment<'dummy_environ
fn translate_table_get(
&mut self,
mut pos: FuncCursor,
_table_index: u32,
_table_index: TableIndex,
_index: ir::Value,
) -> WasmResult<ir::Value> {
Ok(pos.ins().null(self.reference_type()))
@ -452,7 +453,7 @@ impl<'dummy_environment> FuncEnvironment for DummyFuncEnvironment<'dummy_environ
fn translate_table_set(
&mut self,
_pos: FuncCursor,
_table_index: u32,
_table_index: TableIndex,
_value: ir::Value,
_index: ir::Value,
) -> WasmResult<()> {
@ -476,7 +477,7 @@ impl<'dummy_environment> FuncEnvironment for DummyFuncEnvironment<'dummy_environ
fn translate_table_fill(
&mut self,
_pos: FuncCursor,
_table_index: u32,
_table_index: TableIndex,
_dst: ir::Value,
_val: ir::Value,
_len: ir::Value,
@ -534,7 +535,7 @@ impl TargetEnvironment for DummyEnvironment {
}
impl<'data> ModuleEnvironment<'data> for DummyEnvironment {
fn declare_signature(&mut self, sig: ir::Signature) -> WasmResult<()> {
fn declare_signature(&mut self, _wasm: &WasmFuncType, sig: ir::Signature) -> WasmResult<()> {
self.info.signatures.push(sig);
Ok(())
}

Some files were not shown because too many files changed in this diff.