Android-x86
Fork
Spenden

  • R/O
  • HTTP
  • SSH
  • HTTPS

external-swiftshader: Commit

external/swiftshader


Commit MetaInfo

Revision: 4d40271e3f09de5e6734b4223ea15b4faf758d2a (tree)
Zeit: 2020-02-27 04:18:20
Autor: Antonio Maiorano <amaiorano@goog...>
Committer: Antonio Maiorano

Log Message

Subzero: add support for large stacks on Windows

If the stack size is >= 4K, emit a call to chkstk, which probes the stack to
commit the pages required to support the large stack.

Bug: swiftshader:25
Change-Id: I6b9f09218736ffb641cb1dbf95a1de7149633ef8
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/41608
Presubmit-Ready: Antonio Maiorano <amaiorano@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>

Ändern Zusammenfassung

Diff

--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -18,6 +18,7 @@
1818
1919 #include "gtest/gtest.h"
2020
21+#include <array>
2122 #include <cmath>
2223 #include <thread>
2324 #include <tuple>
@@ -1501,6 +1502,53 @@ TEST(ReactorUnitTests, Args_GreaterThan5Mixed)
15011502 }
15021503 }
15031504
1505+// This test was written because on Windows with Subzero, we would get a crash when executing a function
1506+// with a large number of local variables. The problem was that on Windows, 4K pages are allocated as
1507+// needed for the stack whenever an access is made in a "guard page", at which point the page is committed,
1508+// and the next 4K page becomes the guard page. If a stack access is made that's beyond the guard page,
1509+// a regular page fault occurs. To fix this, Subzero (like any other compiler) now emits a call to __chkstk with
1510+// the stack size in EAX, so that it can probe the stack in 4K increments up to that size, committing the
1511+// required pages. See https://docs.microsoft.com/en-us/windows/win32/devnotes/-win32-chkstk.
1512+TEST(ReactorUnitTests, LargeStack)
1513+{
1514+#if defined(_WIN32)
1515+ // An empirically large enough value to access outside the guard pages
1516+ constexpr int ArrayByteSize = 24 * 1024;
1517+ constexpr int ArraySize = ArrayByteSize / sizeof(int32_t);
1518+
1519+ FunctionT<void(int32_t * v)> function;
1520+ {
1521+ // Allocate a stack array large enough that writing to the first element will reach beyond
1522+ // the guard page.
1523+ Array<Int, ArraySize> largeStackArray;
1524+ for(int i = 0; i < ArraySize; ++i)
1525+ {
1526+ largeStackArray[i] = i;
1527+ }
1528+
1529+ Pointer<Int> in = function.Arg<0>();
1530+ for(int i = 0; i < ArraySize; ++i)
1531+ {
1532+ in[i] = largeStackArray[i];
1533+ }
1534+ }
1535+
1536+ auto routine = function("one");
1537+ std::array<int32_t, ArraySize> v;
1538+
1539+ // Run this in a thread, so that we get the default reserved stack size (8K on Win64).
1540+ std::thread t([&] {
1541+ routine(v.data());
1542+ });
1543+ t.join();
1544+
1545+ for(int i = 0; i < ArraySize; ++i)
1546+ {
1547+ EXPECT_EQ(v[i], i);
1548+ }
1549+#endif
1550+}
1551+
15041552 TEST(ReactorUnitTests, Call)
15051553 {
15061554 struct Class
--- a/third_party/subzero/src/IceTargetLoweringX8632.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8632.cpp
@@ -17,6 +17,10 @@
1717
1818 #include "IceTargetLoweringX8632Traits.h"
1919
20+#if defined(SUBZERO_USE_MICROSOFT_ABI)
21+extern "C" void _chkstk();
22+#endif
23+
2024 namespace X8632 {
2125 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
2226 return ::Ice::X8632::TargetX8632::create(Func);
@@ -402,6 +406,32 @@ void TargetX8632::emitSandboxedReturn() {
402406 lowerIndirectJump(T_ecx);
403407 }
404408
409+void TargetX8632::emitStackProbe(size_t StackSizeBytes) {
410+#if defined(SUBZERO_USE_MICROSOFT_ABI)
411+ if (StackSizeBytes >= 4096) {
412+ // _chkstk on Win32 is actually __alloca_probe, which adjusts ESP by the
413+ // stack amount specified in EAX, so we save ESP in ECX, and restore them
414+ // both after the call.
415+
416+ Variable *EAX = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
417+ Variable *ESP = makeReg(IceType_i32, Traits::RegisterSet::Reg_esp);
418+ Variable *ECX = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
419+
420+ _push_reg(ECX->getRegNum());
421+ _mov(ECX, ESP);
422+
423+ _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));
424+
425+ auto *CallTarget =
426+ Ctx->getConstantInt32(reinterpret_cast<int32_t>(&_chkstk));
427+ emitCallToTarget(CallTarget, nullptr);
428+
429+ _mov(ESP, ECX);
430+ _pop_reg(ECX->getRegNum());
431+ }
432+#endif
433+}
434+
405435 // In some cases, there are x-macros tables for both high-level and low-level
406436 // instructions/operands that use the same enum key value. The tables are kept
407437 // separate to maintain a proper separation between abstraction layers. There
--- a/third_party/subzero/src/IceTargetLoweringX8632.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632.h
@@ -59,6 +59,7 @@ protected:
5959 void initSandbox();
6060 bool legalizeOptAddrForSandbox(OptAddr *Addr);
6161 void emitSandboxedReturn();
62+ void emitStackProbe(size_t StackSizeBytes);
6263 void lowerIndirectJump(Variable *JumpTarget);
6364 void emitGetIP(CfgNode *Node);
6465 Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override;
--- a/third_party/subzero/src/IceTargetLoweringX8664.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8664.cpp
@@ -17,6 +17,10 @@
1717 #include "IceDefs.h"
1818 #include "IceTargetLoweringX8664Traits.h"
1919
20+#if defined(SUBZERO_USE_MICROSOFT_ABI)
21+extern "C" void __chkstk();
22+#endif
23+
2024 namespace X8664 {
2125 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
2226 return ::Ice::X8664::TargetX8664::create(Func);
@@ -758,6 +762,26 @@ void TargetX8664::emitSandboxedReturn() {
758762 }
759763 }
760764
765+void TargetX8664::emitStackProbe(size_t StackSizeBytes) {
766+#if defined(SUBZERO_USE_MICROSOFT_ABI)
767+ // Mirroring the behavior of MSVC here, which emits a call to __chkstk when
768+ // locals are >= 4KB, rather than the 8KB claimed by the docs.
769+ if (StackSizeBytes >= 4096) {
770+ // __chkstk on Win64 probes the stack up to RSP - EAX, but does not clobber
771+ // RSP, so we don't need to save and restore it.
772+
773+ Variable *EAX = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
774+ _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));
775+
776+ auto *CallTarget =
777+ Ctx->getConstantInt64(reinterpret_cast<int64_t>(&__chkstk));
778+ Operand *CallTargetReg =
779+ legalizeToReg(CallTarget, Traits::RegisterSet::Reg_r11);
780+ emitCallToTarget(CallTargetReg, nullptr);
781+ }
782+#endif
783+}
784+
761785 // In some cases, there are x-macros tables for both high-level and low-level
762786 // instructions/operands that use the same enum key value. The tables are kept
763787 // separate to maintain a proper separation between abstraction layers. There
--- a/third_party/subzero/src/IceTargetLoweringX8664.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664.h
@@ -62,6 +62,7 @@ protected:
6262 void initSandbox();
6363 bool legalizeOptAddrForSandbox(OptAddr *Addr);
6464 void emitSandboxedReturn();
65+ void emitStackProbe(size_t StackSizeBytes);
6566 void lowerIndirectJump(Variable *JumpTarget);
6667 void emitGetIP(CfgNode *Node);
6768 Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override;
--- a/third_party/subzero/src/IceTargetLoweringX86Base.h
+++ b/third_party/subzero/src/IceTargetLoweringX86Base.h
@@ -376,6 +376,12 @@ protected:
376376 void emitSandboxedReturn() {
377377 dispatchToConcrete(&Traits::ConcreteTarget::emitSandboxedReturn);
378378 }
379+
380+ void emitStackProbe(size_t StackSizeBytes) {
381+ dispatchToConcrete(&Traits::ConcreteTarget::emitStackProbe,
382+ std::move(StackSizeBytes));
383+ }
384+
379385 /// Emit just the call instruction (without argument or return variable
380386 /// processing), sandboxing if needed.
381387 virtual Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) = 0;
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -1199,6 +1199,8 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
11991199 SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
12001200
12011201 if (SpillAreaSizeBytes) {
1202+ emitStackProbe(SpillAreaSizeBytes);
1203+
12021204 // Generate "sub stackptr, SpillAreaSizeBytes"
12031205 _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
12041206 }
Show on old repository browser