Bug 1750234 - Handle GPU process crash when rendering video and webgl on Android. r=jgilbert,jolin

Make the SurfaceAllocator save a reference to all Surfaces it has
allocated. When the connection to the remote allocator dies, mark all
of those surfaces as released. Ensure subsequent attempts to allocate
new Surfaces will connect to the new compositor process (either GPU or
parent).

For webgl, when we detect we are using a Surface that has been
released, clear the pool of Surfaces used for the swapchain so that
we automatically reallocate new ones.

For video, if we encounter an error in RemoteVideoDecoder and can see
that Surface has been released, then report a
NS_ERROR_DOM_MEDIA_NEED_NEW_DECODER in place of the potentially fatal
error code we get from the java decoder. This will ensure a new
decoder is created, which will in turn allocate a new Surface to
decode in to.

On the Android emulator (so potentially real devices too) the java
decoder doesn't actually report any errors in response to the Surface
being released. Therefore we also check for the Surface being released
in ProcessOutput(), and manually raise the NEED_NEW_ENCODER error from
there if so.

Differential Revision: https://phabricator.services.mozilla.com/D133108
This commit is contained in:
Jamie Nicol 2022-02-02 17:01:02 +00:00
Родитель 9c1604b27c
Коммит d3e80e9854
8 изменённых файлов: 134 добавлений и 37 удалений

Просмотреть файл

@ -183,6 +183,11 @@ class RemoteVideoDecoder : public RemoteDataDecoder {
MediaRawData* aSample) override {
AssertOnThread();
if (NeedsNewDecoder()) {
return DecodePromise::CreateAndReject(NS_ERROR_DOM_MEDIA_NEED_NEW_DECODER,
__func__);
}
const VideoInfo* config =
aSample->mTrackInfo ? aSample->mTrackInfo->GetAsVideoInfo() : &mConfig;
MOZ_ASSERT(config);
@ -265,6 +270,17 @@ class RemoteVideoDecoder : public RemoteDataDecoder {
UniquePtr<layers::SurfaceTextureImage::SetCurrentCallback> releaseSample(
new CompositeListener(mJavaDecoder, aSample));
// If our output surface has been released (due to the GPU process crashing)
// then request a new decoder, which will in turn allocate a new
// Surface. This is usually be handled by the Error() callback, but on some
// devices (or at least on the emulator) the java decoder does not raise an
// error when the Surface is released. So we raise this error here as well.
if (NeedsNewDecoder()) {
Error(MediaResult(NS_ERROR_DOM_MEDIA_NEED_NEW_DECODER,
RESULT_DETAIL("VideoCallBack::HandleOutput")));
return;
}
java::sdk::BufferInfo::LocalRef info = aSample->Info();
MOZ_ASSERT(info);
@ -316,6 +332,10 @@ class RemoteVideoDecoder : public RemoteDataDecoder {
}
}
bool NeedsNewDecoder() const override {
return !mSurface || mSurface->IsReleased();
}
const VideoInfo mConfig;
java::GeckoSurface::GlobalRef mSurface;
AndroidSurfaceTextureHandle mSurfaceHandle;
@ -863,8 +883,17 @@ void RemoteDataDecoder::Error(const MediaResult& aError) {
if (GetState() == State::SHUTDOWN) {
return;
}
mDecodePromise.RejectIfExists(aError, __func__);
mDrainPromise.RejectIfExists(aError, __func__);
// If we know we need a new decoder (eg because RemoteVideoDecoder's mSurface
// has been released due to a GPU process crash) then override the error to
// request a new decoder.
const MediaResult& error =
NeedsNewDecoder()
? MediaResult(NS_ERROR_DOM_MEDIA_NEED_NEW_DECODER, __func__)
: aError;
mDecodePromise.RejectIfExists(error, __func__);
mDrainPromise.RejectIfExists(error, __func__);
}
} // namespace mozilla

Просмотреть файл

@ -92,6 +92,12 @@ class RemoteDataDecoder : public MediaDataDecoder,
return mNumPendingInputs > 0;
}
// Returns true if we are in a state which requires a new decoder to be
// created. In this case all errors will be reported as
// NS_ERROR_DOM_MEDIA_NEED_NEW_DECODER to avoid reporting errors as fatal when
// they can be fixed with a new decoder.
virtual bool NeedsNewDecoder() const { return false; }
// The following members must only be accessed on mThread.
MozPromiseHolder<DecodePromise> mDecodePromise;
MozPromiseHolder<DecodePromise> mDrainPromise;

Просмотреть файл

@ -30,7 +30,8 @@ UniquePtr<SwapChainPresenter> SwapChain::Acquire(const gfx::IntSize& size) {
MOZ_ASSERT(mFactory);
std::shared_ptr<SharedSurface> surf;
if (!mPool.empty() && mPool.front()->mDesc.size != size) {
if (!mPool.empty() &&
(mPool.front()->mDesc.size != size || !mPool.front()->IsValid())) {
mPool = {};
}
if (kPoolSize && mPool.size() == kPoolSize) {

Просмотреть файл

@ -136,6 +136,10 @@ class SharedSurface {
virtual bool NeedsIndirectReads() const { return false; }
// Returns true if the surface is still valid to use. If false, the underlying
// resource has been released and we must allocate a new surface instead.
virtual bool IsValid() const { return true; };
virtual Maybe<layers::SurfaceDescriptor> ToSurfaceDescriptor() = 0;
};

Просмотреть файл

@ -246,6 +246,10 @@ bool SharedSurface_SurfaceTexture::IsBufferAvailable() const {
return mSurface->GetAvailable();
}
bool SharedSurface_SurfaceTexture::IsValid() const {
return !mSurface->IsReleased();
}
Maybe<layers::SurfaceDescriptor>
SharedSurface_SurfaceTexture::ToSurfaceDescriptor() {
return Some(layers::SurfaceTextureDescriptor(

Просмотреть файл

@ -107,6 +107,8 @@ class SharedSurface_SurfaceTexture final : public SharedSurface {
virtual void WaitForBufferOwnership() override;
virtual bool IsBufferAvailable() const override;
bool IsValid() const override;
};
class SurfaceFactory_SurfaceTexture final : public SurfaceFactory {

Просмотреть файл

@ -20,6 +20,7 @@ public final class GeckoSurface extends Surface {
private boolean mIsSingleBuffer;
private volatile boolean mIsAvailable;
private boolean mOwned = true;
private volatile boolean mIsReleased = false;
private int mMyPid;
// Locally allocated surface/texture. Do not pass it over IPC.
@ -83,6 +84,11 @@ public final class GeckoSurface extends Surface {
@Override
public void release() {
if (mIsReleased) {
return;
}
mIsReleased = true;
if (mSyncSurface != null) {
mSyncSurface.release();
final GeckoSurfaceTexture gst = GeckoSurfaceTexture.lookup(mSyncSurface.getHandle());
@ -107,6 +113,11 @@ public final class GeckoSurface extends Surface {
return mIsAvailable;
}
@WrapForJNI
public boolean isReleased() {
return mIsReleased;
}
@WrapForJNI
public void setAvailable(final boolean available) {
mIsAvailable = available;

Просмотреть файл

@ -5,8 +5,10 @@
package org.mozilla.gecko.gfx;
import android.os.IBinder;
import android.os.RemoteException;
import android.util.Log;
import android.util.SparseArray;
import org.mozilla.gecko.GeckoAppShell;
import org.mozilla.gecko.annotation.WrapForJNI;
import org.mozilla.gecko.process.GeckoProcessManager;
@ -17,77 +19,115 @@ import org.mozilla.gecko.process.GeckoServiceChildProcess;
private static ISurfaceAllocator sAllocator;
private static synchronized void ensureConnection() throws Exception {
// Keep a reference to all allocated Surfaces, so that we can release them if we lose the
// connection to the allocator service.
private static final SparseArray<GeckoSurface> sSurfaces = new SparseArray<GeckoSurface>();
private static synchronized void ensureConnection() {
if (sAllocator != null) {
return;
}
if (GeckoAppShell.isParentProcess()) {
sAllocator = GeckoProcessManager.getInstance().getSurfaceAllocator();
} else {
sAllocator = GeckoServiceChildProcess.getSurfaceAllocator();
}
try {
if (GeckoAppShell.isParentProcess()) {
sAllocator = GeckoProcessManager.getInstance().getSurfaceAllocator();
} else {
sAllocator = GeckoServiceChildProcess.getSurfaceAllocator();
}
if (sAllocator == null) {
throw new Exception("Failed to connect to surface allocator service!");
if (sAllocator == null) {
Log.w(LOGTAG, "Failed to connect to RemoteSurfaceAllocator");
return;
}
sAllocator
.asBinder()
.linkToDeath(
new IBinder.DeathRecipient() {
@Override
public void binderDied() {
Log.w(LOGTAG, "RemoteSurfaceAllocator died");
synchronized (SurfaceAllocator.class) {
// Our connection to the remote allocator has died, so all our surfaces are
// invalid. Release them all now. When their owners attempt to render in to
// them they can detect they have been released and allocate new ones instead.
for (int i = 0; i < sSurfaces.size(); i++) {
sSurfaces.valueAt(i).release();
}
sSurfaces.clear();
sAllocator = null;
}
}
},
0);
} catch (final RemoteException e) {
Log.w(LOGTAG, "Failed to connect to RemoteSurfaceAllocator", e);
sAllocator = null;
}
}
@WrapForJNI
public static GeckoSurface acquireSurface(
public static synchronized GeckoSurface acquireSurface(
final int width, final int height, final boolean singleBufferMode) {
try {
ensureConnection();
if (sAllocator == null) {
Log.w(LOGTAG, "Failed to acquire GeckoSurface: not connected");
return null;
}
if (singleBufferMode && !GeckoSurfaceTexture.isSingleBufferSupported()) {
return null;
}
final GeckoSurface surface = sAllocator.acquireSurface(width, height, singleBufferMode);
sSurfaces.put(surface.getHandle(), surface);
if (surface != null && !surface.inProcess()) {
sAllocator.configureSync(surface.initSyncSurface(width, height));
}
return surface;
} catch (final Exception e) {
} catch (final RemoteException e) {
Log.w(LOGTAG, "Failed to acquire GeckoSurface", e);
return null;
}
}
@WrapForJNI
public static void disposeSurface(final GeckoSurface surface) {
try {
ensureConnection();
} catch (final Exception e) {
Log.w(LOGTAG, "Failed to dispose surface, no connection");
public static synchronized void disposeSurface(final GeckoSurface surface) {
// If the surface has already been released (probably due to losing connection to the remote
// allocator) then there is nothing to do here.
if (surface.isReleased()) {
return;
}
// Release the SurfaceTexture on the other side
sSurfaces.remove(surface.getHandle());
// Release our Surface
surface.release();
if (sAllocator == null) {
return;
}
// Release the SurfaceTexture on the other side. If we have lost connection then do nothing, as
// there is nothing on the other side to release.
try {
sAllocator.releaseSurface(surface.getHandle());
if (sAllocator != null) {
sAllocator.releaseSurface(surface.getHandle());
}
} catch (final RemoteException e) {
Log.w(LOGTAG, "Failed to release surface texture", e);
}
// And now our Surface
try {
surface.release();
} catch (final Exception e) {
Log.w(LOGTAG, "Failed to release surface", e);
}
}
public static void sync(final int upstream) {
public static synchronized void sync(final int upstream) {
// Sync from the SurfaceTexture on the other side. If we have lost connection then do nothing,
// as there is nothing on the other side to sync from.
try {
ensureConnection();
} catch (final Exception e) {
Log.w(LOGTAG, "Failed to sync texture, no connection");
return;
}
// Release the SurfaceTexture on the other side
try {
sAllocator.sync(upstream);
if (sAllocator != null) {
sAllocator.sync(upstream);
}
} catch (final RemoteException e) {
Log.w(LOGTAG, "Failed to sync texture", e);
}