summarylogtreecommitdiffstats
path: root/make_SystemResourceMonitor.stop_more_resilient_to_errors.patch
diff options
context:
space:
mode:
Diffstat (limited to 'make_SystemResourceMonitor.stop_more_resilient_to_errors.patch')
-rw-r--r--make_SystemResourceMonitor.stop_more_resilient_to_errors.patch119
1 files changed, 119 insertions, 0 deletions
diff --git a/make_SystemResourceMonitor.stop_more_resilient_to_errors.patch b/make_SystemResourceMonitor.stop_more_resilient_to_errors.patch
new file mode 100644
index 000000000000..ff75dcfd0c86
--- /dev/null
+++ b/make_SystemResourceMonitor.stop_more_resilient_to_errors.patch
@@ -0,0 +1,119 @@
+
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1502856976 -32400
+# Node ID 8d9ae8c45dd07496f164364f24c5f43edcf2eb6e
+# Parent fa02d334033f0039ef82b3d7dc194312ba250279
+Bug 1384062 - Make SystemResourceMonitor.stop more resilient to errors. r=ahal,gps
+
+The poll() call in SystemResourceMonitor.stop might fail even though
+there is something to read from the pipe, in some corner cases, and
+python won't let us know about it. In that case, an exception is thrown,
+leaving the SystemResourceMonitor (and its callers) in a weird state. In
+practice, this leads BuildMonitor.__exit__ to call stop again, which then
+fails.
+
+So when poll() throws an exception, we pretend there's still something
+to read, and we try to read anyways. If there is something to read,
+recv() will return it, otherwise, it will throw an exception of its own,
+which we catch, pretending we're done.
+
+Furthermore, when there is nothing to read from the pipe, poll() simply
+returns False, and our loop never sets `done` to True, and we then hit
+an assert, which doesn't have its place here, so we remove it.
+
+Finally, the other end of the pipe might have died at any time, making
+sending over the pipe fail, so we also protect against that.
+
+With all these changes, it feels like the reason for backing out bug
+1239939 in bug 1272782 should have been dealt with, and we can drop the
+timeout again.
+
+diff --git a/testing/mozbase/mozsystemmonitor/mozsystemmonitor/resourcemonitor.py b/testing/mozbase/mozsystemmonitor/mozsystemmonitor/resourcemonitor.py
+--- a/testing/mozbase/mozsystemmonitor/mozsystemmonitor/resourcemonitor.py
++++ b/testing/mozbase/mozsystemmonitor/mozsystemmonitor/resourcemonitor.py
+@@ -284,57 +284,71 @@ class SystemResourceMonitor(object):
+ """
+ if not self._process:
+ self._stopped = True
+ return
+
+ assert self._running
+ assert not self._stopped
+
+- self._pipe.send(('terminate',))
++ try:
++ self._pipe.send(('terminate',))
++ except Exception:
++ pass
+ self._running = False
+ self._stopped = True
+
+ self.measurements = []
+
+- done = False
+-
+ # The child process will send each data sample over the pipe
+ # as a separate data structure. When it has finished sending
+ # samples, it sends a special "done" message to indicate it
+ # is finished.
+- while self._pipe.poll(1.0):
+- start_time, end_time, io_diff, cpu_diff, cpu_percent, virt_mem, \
+- swap_mem = self._pipe.recv()
++
++ # multiprocessing.Pipe is not actually a pipe on at least Linux. That
++ # has an effect on the expected outcome of reading from it when the
++ # other end of the pipe dies, possibly leading to a hang on recv()
++ # below. So we must poll().
++ def poll():
++ try:
++ return self._pipe.poll(0.1)
++ except Exception:
++ # Poll might throw an exception even though there's still
++ # data to read. That happens when the underlying system call
++ # returns both POLLERR and POLLIN, but python doesn't tell us
++ # about it. So assume there is something to read, and we'll
++ # get an exception when trying to read the data.
++ return True
++ while poll():
++ try:
++ start_time, end_time, io_diff, cpu_diff, cpu_percent, virt_mem, \
++ swap_mem = self._pipe.recv()
++ except Exception:
++ # Let's assume we're done here
++ break
+
+ # There should be nothing after the "done" message so
+ # terminate.
+ if start_time == 'done':
+- done = True
+ break
+
+ io = self._io_type(*io_diff)
+ virt = self._virt_type(*virt_mem)
+ swap = self._swap_type(*swap_mem)
+ cpu_times = [self._cpu_times_type(*v) for v in cpu_diff]
+
+ self.measurements.append(SystemResourceUsage(start_time, end_time,
+ cpu_times, cpu_percent, io, virt, swap))
+
+ # We establish a timeout so we don't hang forever if the child
+ # process has crashed.
+ self._process.join(10)
+ if self._process.is_alive():
+ self._process.terminate()
+ self._process.join(10)
+- else:
+- # We should have received a "done" message from the
+- # child indicating it shut down properly. This only
+- # happens if the child shuts down cleanly.
+- assert done
+
+ if len(self.measurements):
+ self.start_time = self.measurements[0].start
+ self.end_time = self.measurements[-1].end
+
+ # Methods to record events alongside the monitored data.
+
+ def record_event(self, name):
+