Revision | 247ba27c528c52e4a41c233c1c9a699f40e4d2a5 (tree) |
---|---|
Zeit | 2019-05-21 22:56:57 |
Autor | Peter Maydell <peter.maydell@lina...> |
Committer | Peter Maydell |
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
pci, pc, virtio: features, fixes
reconnect for vhost blk
tests for UEFI
misc other stuff
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Tue 21 May 2019 14:41:32 BST
# gpg: using RSA key 281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream: (34 commits)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
@@ -1484,7 +1484,7 @@ M: Michael S. Tsirkin <mst@redhat.com> | ||
1484 | 1484 | S: Supported |
1485 | 1485 | F: hw/*/*vhost* |
1486 | 1486 | F: docs/interop/vhost-user.json |
1487 | -F: docs/interop/vhost-user.txt | |
1487 | +F: docs/interop/vhost-user.rst | |
1488 | 1488 | F: contrib/vhost-user-*/ |
1489 | 1489 | F: backends/vhost-user.c |
1490 | 1490 | F: include/sysemu/vhost-user-backend.h |
@@ -433,7 +433,7 @@ vu_log_write(VuDev *dev, uint64_t address, uint64_t length) | ||
433 | 433 | page = address / VHOST_LOG_PAGE; |
434 | 434 | while (page * VHOST_LOG_PAGE < address + length) { |
435 | 435 | vu_log_page(dev->log_table, page); |
436 | - page += VHOST_LOG_PAGE; | |
436 | + page += 1; | |
437 | 437 | } |
438 | 438 | |
439 | 439 | vu_log_kick(dev); |
@@ -398,7 +398,8 @@ vub_get_features(VuDev *dev) | ||
398 | 398 | static uint64_t |
399 | 399 | vub_get_protocol_features(VuDev *dev) |
400 | 400 | { |
401 | - return 1ull << VHOST_USER_PROTOCOL_F_CONFIG; | |
401 | + return 1ull << VHOST_USER_PROTOCOL_F_CONFIG | | |
402 | + 1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD; | |
402 | 403 | } |
403 | 404 | |
404 | 405 | static int |
@@ -15,4 +15,4 @@ Contents: | ||
15 | 15 | bitmaps |
16 | 16 | live-block-operations |
17 | 17 | pr-helper |
18 | - | |
18 | + vhost-user |
@@ -0,0 +1,1351 @@ | ||
1 | +=================== | |
2 | +Vhost-user Protocol | |
3 | +=================== | |
4 | +:Copyright: 2014 Virtual Open Systems Sarl. | |
5 | +:Licence: This work is licensed under the terms of the GNU GPL, | |
6 | + version 2 or later. See the COPYING file in the top-level | |
7 | + directory. | |
8 | + | |
9 | +.. contents:: Table of Contents | |
10 | + | |
11 | +Introduction | |
12 | +============ | |
13 | + | |
14 | +This protocol aims to complement the ``ioctl`` interface used to | |
15 | +control the vhost implementation in the Linux kernel. It implements | |
16 | +the control plane needed to establish virtqueue sharing with a user | |
17 | +space process on the same host. It uses communication over a Unix | |
18 | +domain socket to share file descriptors in the ancillary data of the | |
19 | +message. | |
20 | + | |
21 | +The protocol defines 2 sides of the communication, *master* and | |
22 | +*slave*. *Master* is the application that shares its virtqueues, in | |
23 | +our case QEMU. *Slave* is the consumer of the virtqueues. | |
24 | + | |
25 | +In the current implementation QEMU is the *master*, and the *slave* is | |
26 | +the external process consuming the virtio queues, for example a | |
27 | +software Ethernet switch running in user space, such as Snabbswitch, | |
28 | +or a block device backend processing read & write to a virtual | |
29 | +disk. In order to facilitate interoperability between various backend | |
30 | +implementations, it is recommended to follow the :ref:`Backend program | |
31 | +conventions <backend_conventions>`. | |
32 | + | |
33 | +*Master* and *slave* can be either a client (i.e. connecting) or | |
34 | +server (listening) in the socket communication. | |
35 | + | |
36 | +Message Specification | |
37 | +===================== | |
38 | + | |
39 | +.. Note:: All numbers are in the machine native byte order. | |
40 | + | |
41 | +A vhost-user message consists of 3 header fields and a payload. | |
42 | + | |
43 | ++---------+-------+------+---------+ | |
44 | +| request | flags | size | payload | | |
45 | ++---------+-------+------+---------+ | |
46 | + | |
47 | +Header | |
48 | +------ | |
49 | + | |
50 | +:request: 32-bit type of the request | |
51 | + | |
52 | +:flags: 32-bit bit field | |
53 | + | |
54 | +- Lower 2 bits are the version (currently 0x01) | |
55 | +- Bit 2 is the reply flag - needs to be sent on each reply from the slave | |
56 | +- Bit 3 is the need_reply flag - see :ref:`REPLY_ACK <reply_ack>` for | |
57 | + details. | |
58 | + | |
59 | +:size: 32-bit size of the payload | |
60 | + | |
61 | +Payload | |
62 | +------- | |
63 | + | |
64 | +Depending on the request type, **payload** can be: | |
65 | + | |
66 | +A single 64-bit integer | |
67 | +^^^^^^^^^^^^^^^^^^^^^^^ | |
68 | + | |
69 | ++-----+ | |
70 | +| u64 | | |
71 | ++-----+ | |
72 | + | |
73 | +:u64: a 64-bit unsigned integer | |
74 | + | |
75 | +A vring state description | |
76 | +^^^^^^^^^^^^^^^^^^^^^^^^^ | |
77 | + | |
78 | ++-------+-----+ | |
79 | +| index | num | | |
80 | ++-------+-----+ | |
81 | + | |
82 | +:index: a 32-bit index | |
83 | + | |
84 | +:num: a 32-bit number | |
85 | + | |
86 | +A vring address description | |
87 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
88 | + | |
89 | ++-------+-------+------------+------+-----------+-----+ | |
90 | +| index | flags | descriptor | used | available | log | | |
91 | ++-------+-------+------------+------+-----------+-----+ | |
92 | + | |
93 | +:index: a 32-bit vring index | |
94 | + | |
95 | +:flags: a 32-bit vring flags | |
96 | + | |
97 | +:descriptor: a 64-bit ring address of the vring descriptor table | |
98 | + | |
99 | +:used: a 64-bit ring address of the vring used ring | |
100 | + | |
101 | +:available: a 64-bit ring address of the vring available ring | |
102 | + | |
103 | +:log: a 64-bit guest address for logging | |
104 | + | |
105 | +Note that a ring address is an IOVA if ``VIRTIO_F_IOMMU_PLATFORM`` has | |
106 | +been negotiated. Otherwise it is a user address. | |
107 | + | |
108 | +Memory regions description | |
109 | +^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
110 | + | |
111 | ++-------------+---------+---------+-----+---------+ | |
112 | +| num regions | padding | region0 | ... | region7 | | |
113 | ++-------------+---------+---------+-----+---------+ | |
114 | + | |
115 | +:num regions: a 32-bit number of regions | |
116 | + | |
117 | +:padding: 32-bit | |
118 | + | |
119 | +A region is: | |
120 | + | |
121 | ++---------------+------+--------------+-------------+ | |
122 | +| guest address | size | user address | mmap offset | | |
123 | ++---------------+------+--------------+-------------+ | |
124 | + | |
125 | +:guest address: a 64-bit guest address of the region | |
126 | + | |
127 | +:size: a 64-bit size | |
128 | + | |
129 | +:user address: a 64-bit user address | |
130 | + | |
131 | +:mmap offset: 64-bit offset where region starts in the mapped memory | |
132 | + | |
133 | +Log description | |
134 | +^^^^^^^^^^^^^^^ | |
135 | + | |
136 | ++----------+------------+ | |
137 | +| log size | log offset | | |
138 | ++----------+------------+ | |
139 | + | |
140 | +:log size: size of area used for logging | |
141 | + | |
142 | +:log offset: offset from start of supplied file descriptor where | |
143 | + logging starts (i.e. where guest address 0 would be | |
144 | + logged) | |
145 | + | |
146 | +An IOTLB message | |
147 | +^^^^^^^^^^^^^^^^ | |
148 | + | |
149 | ++------+------+--------------+-------------------+------+ | |
150 | +| iova | size | user address | permissions flags | type | | |
151 | ++------+------+--------------+-------------------+------+ | |
152 | + | |
153 | +:iova: a 64-bit I/O virtual address programmed by the guest | |
154 | + | |
155 | +:size: a 64-bit size | |
156 | + | |
157 | +:user address: a 64-bit user address | |
158 | + | |
159 | +:permissions flags: an 8-bit value: | |
160 | + - 0: No access | |
161 | + - 1: Read access | |
162 | + - 2: Write access | |
163 | + - 3: Read/Write access | |
164 | + | |
165 | +:type: an 8-bit IOTLB message type: | |
166 | + - 1: IOTLB miss | |
167 | + - 2: IOTLB update | |
168 | + - 3: IOTLB invalidate | |
169 | + - 4: IOTLB access fail | |
170 | + | |
171 | +Virtio device config space | |
172 | +^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
173 | + | |
174 | ++--------+------+-------+---------+ | |
175 | +| offset | size | flags | payload | | |
176 | ++--------+------+-------+---------+ | |
177 | + | |
178 | +:offset: a 32-bit offset of virtio device's configuration space | |
179 | + | |
180 | +:size: a 32-bit configuration space access size in bytes | |
181 | + | |
182 | +:flags: a 32-bit value: | |
183 | + - 0: Vhost master messages used for writeable fields | |
184 | + - 1: Vhost master messages used for live migration | |
185 | + | |
186 | +:payload: Size bytes array holding the contents of the virtio | |
187 | + device's configuration space | |
188 | + | |
189 | +Vring area description | |
190 | +^^^^^^^^^^^^^^^^^^^^^^ | |
191 | + | |
192 | ++-----+------+--------+ | |
193 | +| u64 | size | offset | | |
194 | ++-----+------+--------+ | |
195 | + | |
196 | +:u64: a 64-bit integer containing the vring index and flags | |
197 | + | |
198 | +:size: a 64-bit size of this area | |
199 | + | |
200 | +:offset: a 64-bit offset of this area from the start of the | |
201 | + supplied file descriptor | |
202 | + | |
203 | +Inflight description | |
204 | +^^^^^^^^^^^^^^^^^^^^ | |
205 | + | |
206 | ++-----------+-------------+------------+------------+ | |
207 | +| mmap size | mmap offset | num queues | queue size | | |
208 | ++-----------+-------------+------------+------------+ | |
209 | + | |
210 | +:mmap size: a 64-bit size of area to track inflight I/O | |
211 | + | |
212 | +:mmap offset: a 64-bit offset of this area from the start | |
213 | + of the supplied file descriptor | |
214 | + | |
215 | +:num queues: a 16-bit number of virtqueues | |
216 | + | |
217 | +:queue size: a 16-bit size of virtqueues | |
218 | + | |
219 | +C structure | |
220 | +----------- | |
221 | + | |
222 | +In QEMU the vhost-user message is implemented with the following struct: | |
223 | + | |
224 | +.. code:: c | |
225 | + | |
226 | + typedef struct VhostUserMsg { | |
227 | + VhostUserRequest request; | |
228 | + uint32_t flags; | |
229 | + uint32_t size; | |
230 | + union { | |
231 | + uint64_t u64; | |
232 | + struct vhost_vring_state state; | |
233 | + struct vhost_vring_addr addr; | |
234 | + VhostUserMemory memory; | |
235 | + VhostUserLog log; | |
236 | + struct vhost_iotlb_msg iotlb; | |
237 | + VhostUserConfig config; | |
238 | + VhostUserVringArea area; | |
239 | + VhostUserInflight inflight; | |
240 | + }; | |
241 | + } QEMU_PACKED VhostUserMsg; | |
242 | + | |
243 | +Communication | |
244 | +============= | |
245 | + | |
246 | +The protocol for vhost-user is based on the existing implementation of | |
247 | +vhost for the Linux Kernel. Most messages that can be sent via the | |
248 | +Unix domain socket implementing vhost-user have an equivalent ioctl to | |
249 | +the kernel implementation. | |
250 | + | |
251 | +The communication consists of *master* sending message requests and | |
252 | +*slave* sending message replies. Most of the requests don't require | |
253 | +replies. Here is a list of the ones that do: | |
254 | + | |
255 | +* ``VHOST_USER_GET_FEATURES`` | |
256 | +* ``VHOST_USER_GET_PROTOCOL_FEATURES`` | |
257 | +* ``VHOST_USER_GET_VRING_BASE`` | |
258 | +* ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``) | |
259 | +* ``VHOST_USER_GET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``) | |
260 | + | |
261 | +.. seealso:: | |
262 | + | |
263 | + :ref:`REPLY_ACK <reply_ack>` | |
264 | + The section on ``REPLY_ACK`` protocol extension. | |
265 | + | |
266 | +There are several messages that the master sends with file descriptors passed | |
267 | +in the ancillary data: | |
268 | + | |
269 | +* ``VHOST_USER_SET_MEM_TABLE`` | |
270 | +* ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``) | |
271 | +* ``VHOST_USER_SET_LOG_FD`` | |
272 | +* ``VHOST_USER_SET_VRING_KICK`` | |
273 | +* ``VHOST_USER_SET_VRING_CALL`` | |
274 | +* ``VHOST_USER_SET_VRING_ERR`` | |
275 | +* ``VHOST_USER_SET_SLAVE_REQ_FD`` | |
276 | +* ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``) | |
277 | + | |
278 | +If *master* is unable to send the full message or receives a wrong | |
279 | +reply it will close the connection. An optional reconnection mechanism | |
280 | +can be implemented. | |
281 | + | |
282 | +Any protocol extensions are gated by protocol feature bits, which | |
283 | +allows full backwards compatibility on both master and slave. As | |
284 | +older slaves don't support negotiating protocol features, a feature | |
285 | +bit was dedicated for this purpose:: | |
286 | + | |
287 | + #define VHOST_USER_F_PROTOCOL_FEATURES 30 | |
288 | + | |
289 | +Starting and stopping rings | |
290 | +--------------------------- | |
291 | + | |
292 | +Client must only process each ring when it is started. | |
293 | + | |
294 | +Client must only pass data between the ring and the backend, when the | |
295 | +ring is enabled. | |
296 | + | |
297 | +If ring is started but disabled, client must process the ring without | |
298 | +talking to the backend. | |
299 | + | |
300 | +For example, for a networking device, in the disabled state client | |
301 | +must not supply any new RX packets, but must process and discard any | |
302 | +TX packets. | |
303 | + | |
304 | +If ``VHOST_USER_F_PROTOCOL_FEATURES`` has not been negotiated, the | |
305 | +ring is initialized in an enabled state. | |
306 | + | |
307 | +If ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, the ring is | |
308 | +initialized in a disabled state. Client must not pass data to/from the | |
309 | +backend until ring is enabled by ``VHOST_USER_SET_VRING_ENABLE`` with | |
310 | +parameter 1, or after it has been disabled by | |
311 | +``VHOST_USER_SET_VRING_ENABLE`` with parameter 0. | |
312 | + | |
313 | +Each ring is initialized in a stopped state, client must not process | |
314 | +it until ring is started, or after it has been stopped. | |
315 | + | |
316 | +Client must start ring upon receiving a kick (that is, detecting that | |
317 | +file descriptor is readable) on the descriptor specified by | |
318 | +``VHOST_USER_SET_VRING_KICK``, and stop ring upon receiving | |
319 | +``VHOST_USER_GET_VRING_BASE``. | |
320 | + | |
321 | +While processing the rings (whether they are enabled or not), client | |
322 | +must support changing some configuration aspects on the fly. | |
323 | + | |
324 | +Multiple queue support | |
325 | +---------------------- | |
326 | + | |
327 | +Multiple queue is treated as a protocol extension, hence the slave has | |
328 | +to implement protocol features first. The multiple queues feature is | |
329 | +supported only when the protocol feature ``VHOST_USER_PROTOCOL_F_MQ`` | |
330 | +(bit 0) is set. | |
331 | + | |
332 | +The max number of queue pairs the slave supports can be queried with | |
333 | +message ``VHOST_USER_GET_QUEUE_NUM``. Master should stop when the | |
334 | +number of requested queues is bigger than that. | |
335 | + | |
336 | +As all queues share one connection, the master uses a unique index for each | |
337 | +queue in the sent message to identify a specified queue. One queue pair | |
338 | +is enabled initially. More queues are enabled dynamically, by sending | |
339 | +message ``VHOST_USER_SET_VRING_ENABLE``. | |
340 | + | |
341 | +Migration | |
342 | +--------- | |
343 | + | |
344 | +During live migration, the master may need to track the modifications | |
345 | +the slave makes to the memory mapped regions. The client should mark | |
346 | +the dirty pages in a log. Once it complies with this logging, it may | |
347 | +declare the ``VHOST_F_LOG_ALL`` vhost feature. | |
348 | + | |
349 | +To start/stop logging of data/used ring writes, server may send | |
350 | +messages ``VHOST_USER_SET_FEATURES`` with ``VHOST_F_LOG_ALL`` and | |
351 | +``VHOST_USER_SET_VRING_ADDR`` with ``VHOST_VRING_F_LOG`` in ring's | |
352 | +flags set to 1/0, respectively. | |
353 | + | |
354 | +All the modifications to memory pointed to by vring "descriptor" should | |
355 | +be marked. Modifications to "used" vring should be marked if | |
356 | +``VHOST_VRING_F_LOG`` is part of ring's flags. | |
357 | + | |
358 | +Dirty pages are of size:: | |
359 | + | |
360 | + #define VHOST_LOG_PAGE 0x1000 | |
361 | + | |
362 | +The log memory fd is provided in the ancillary data of | |
363 | +``VHOST_USER_SET_LOG_BASE`` message when the slave has | |
364 | +``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature. | |
365 | + | |
366 | +The size of the log is supplied as part of ``VhostUserMsg`` which | |
367 | +should be large enough to cover all known guest addresses. Log starts | |
368 | +at the supplied offset in the supplied file descriptor. The log | |
369 | +covers from address 0 to the maximum of guest regions. In pseudo-code, | |
370 | +to mark page at ``addr`` as dirty:: | |
371 | + | |
372 | + page = addr / VHOST_LOG_PAGE | |
373 | + log[page / 8] |= 1 << page % 8 | |
374 | + | |
375 | +Where ``addr`` is the guest physical address. | |
376 | + | |
377 | +Use atomic operations, as the log may be concurrently manipulated. | |
378 | + | |
379 | +Note that when logging modifications to the used ring (when | |
380 | +``VHOST_VRING_F_LOG`` is set for this ring), ``log_guest_addr`` should | |
381 | +be used to calculate the log offset: the write to first byte of the | |
382 | +used ring is logged at this offset from log start. Also note that this | |
383 | +value might be outside the legal guest physical address range | |
384 | +(i.e. does not have to be covered by the ``VhostUserMemory`` table), but | |
385 | +the bit offset of the last byte of the ring must fall within the size | |
386 | +supplied by ``VhostUserLog``. | |
387 | + | |
388 | +``VHOST_USER_SET_LOG_FD`` is an optional message with an eventfd in | |
389 | +ancillary data, it may be used to inform the master that the log has | |
390 | +been modified. | |
391 | + | |
392 | +Once the source has finished migration, rings will be stopped by the | |
393 | +source. No further update must be done before rings are restarted. | |
394 | + | |
395 | +In postcopy migration the slave is started before all the memory has | |
396 | +been received from the source host, and care must be taken to avoid | |
397 | +accessing pages that have yet to be received. The slave opens a | |
398 | +'userfault'-fd and registers the memory with it; this fd is then | |
399 | +passed back over to the master. The master services requests on the | |
400 | +userfaultfd for pages that are accessed and when the page is available | |
401 | +it performs WAKE ioctl's on the userfaultfd to wake the stalled | |
402 | +slave. The client indicates support for this via the | |
403 | +``VHOST_USER_PROTOCOL_F_PAGEFAULT`` feature. | |
404 | + | |
405 | +Memory access | |
406 | +------------- | |
407 | + | |
408 | +The master sends a list of vhost memory regions to the slave using the | |
409 | +``VHOST_USER_SET_MEM_TABLE`` message. Each region has two base | |
410 | +addresses: a guest address and a user address. | |
411 | + | |
412 | +Messages contain guest addresses and/or user addresses to reference locations | |
413 | +within the shared memory. The mapping of these addresses works as follows. | |
414 | + | |
415 | +User addresses map to the vhost memory region containing that user address. | |
416 | + | |
417 | +When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has not been negotiated: | |
418 | + | |
419 | +* Guest addresses map to the vhost memory region containing that guest | |
420 | + address. | |
421 | + | |
422 | +When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated: | |
423 | + | |
424 | +* Guest addresses are also called I/O virtual addresses (IOVAs). They are | |
425 | + translated to user addresses via the IOTLB. | |
426 | + | |
427 | +* The vhost memory region guest address is not used. | |
428 | + | |
429 | +IOMMU support | |
430 | +------------- | |
431 | + | |
432 | +When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated, the | |
433 | +master sends IOTLB entries update & invalidation by sending | |
434 | +``VHOST_USER_IOTLB_MSG`` requests to the slave with a ``struct | |
435 | +vhost_iotlb_msg`` as payload. For update events, the ``iotlb`` payload | |
436 | +has to be filled with the update message type (2), the I/O virtual | |
437 | +address, the size, the user virtual address, and the permissions | |
438 | +flags. Addresses and size must be within vhost memory regions set via | |
439 | +the ``VHOST_USER_SET_MEM_TABLE`` request. For invalidation events, the | |
440 | +``iotlb`` payload has to be filled with the invalidation message type | |
441 | +(3), the I/O virtual address and the size. On success, the slave is | |
442 | +expected to reply with a zero payload, non-zero otherwise. | |
443 | + | |
444 | +The slave relies on the slave communication channel (see :ref:`Slave | |
445 | +communication <slave_communication>` section below) to send IOTLB miss | |
446 | +and access failure events, by sending ``VHOST_USER_SLAVE_IOTLB_MSG`` | |
447 | +requests to the master with a ``struct vhost_iotlb_msg`` as | |
448 | +payload. For miss events, the iotlb payload has to be filled with the | |
449 | +miss message type (1), the I/O virtual address and the permissions | |
450 | +flags. For access failure event, the iotlb payload has to be filled | |
451 | +with the access failure message type (4), the I/O virtual address and | |
452 | +the permissions flags. For synchronization purposes, the slave may | |
453 | +rely on the reply-ack feature, so the master may send a reply when | |
454 | +operation is completed if the reply-ack feature is negotiated and | |
455 | +the slave requests a reply. For miss events, completed operation means | |
456 | +either master sent an update message containing the IOTLB entry | |
457 | +containing requested address and permission, or master sent nothing if | |
458 | +the IOTLB miss message is invalid (invalid IOVA or permission). | |
459 | + | |
460 | +The master isn't expected to take the initiative to send IOTLB update | |
461 | +messages, as the slave sends IOTLB miss messages for the guest virtual | |
462 | +memory areas it needs to access. | |
463 | + | |
464 | +.. _slave_communication: | |
465 | + | |
466 | +Slave communication | |
467 | +------------------- | |
468 | + | |
469 | +An optional communication channel is provided if the slave declares | |
470 | +``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` protocol feature, to allow the | |
471 | +slave to make requests to the master. | |
472 | + | |
473 | +The fd is provided via ``VHOST_USER_SET_SLAVE_REQ_FD`` ancillary data. | |
474 | + | |
475 | +A slave may then send ``VHOST_USER_SLAVE_*`` messages to the master | |
476 | +using this fd communication channel. | |
477 | + | |
478 | +If ``VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD`` protocol feature is | |
479 | +negotiated, slave can send file descriptors (at most 8 descriptors in | |
480 | +each message) to master via ancillary data using this fd communication | |
481 | +channel. | |
482 | + | |
483 | +Inflight I/O tracking | |
484 | +--------------------- | |
485 | + | |
486 | +To support reconnecting after restart or crash, slave may need to | |
487 | +resubmit inflight I/Os. If virtqueue is processed in order, we can | |
488 | +easily achieve that by getting the inflight descriptors from | |
489 | +descriptor table (split virtqueue) or descriptor ring (packed | |
490 | +virtqueue). However, it can't work when we process descriptors | |
491 | +out-of-order because some entries which store the information of | |
492 | +inflight descriptors in available ring (split virtqueue) or descriptor | |
493 | +ring (packed virtqueue) might be overwritten by new entries. To solve | |
494 | +this problem, the slave needs to allocate an extra buffer to store this | |
495 | +information about inflight descriptors and share it with the master for | |
496 | +persistence. ``VHOST_USER_GET_INFLIGHT_FD`` and | |
497 | +``VHOST_USER_SET_INFLIGHT_FD`` are used to transfer this buffer | |
498 | +between master and slave. And the format of this buffer is described | |
499 | +below: | |
500 | + | |
501 | ++---------------+---------------+-----+---------------+ | |
502 | +| queue0 region | queue1 region | ... | queueN region | | |
503 | ++---------------+---------------+-----+---------------+ | |
504 | + | |
505 | +N is the number of available virtqueues. Slave could get it from num | |
506 | +queues field of ``VhostUserInflight``. | |
507 | + | |
508 | +For split virtqueue, queue region can be implemented as: | |
509 | + | |
510 | +.. code:: c | |
511 | + | |
512 | + typedef struct DescStateSplit { | |
513 | + /* Indicate whether this descriptor is inflight or not. | |
514 | + * Only available for head-descriptor. */ | |
515 | + uint8_t inflight; | |
516 | + | |
517 | + /* Padding */ | |
518 | + uint8_t padding[5]; | |
519 | + | |
520 | + /* Maintain a list for the last batch of used descriptors. | |
521 | + * Only available when batching is used for submitting */ | |
522 | + uint16_t next; | |
523 | + | |
524 | + /* Used to preserve the order of fetching available descriptors. | |
525 | + * Only available for head-descriptor. */ | |
526 | + uint64_t counter; | |
527 | + } DescStateSplit; | |
528 | + | |
529 | + typedef struct QueueRegionSplit { | |
530 | + /* The feature flags of this region. Now it's initialized to 0. */ | |
531 | + uint64_t features; | |
532 | + | |
533 | + /* The version of this region. It's 1 currently. | |
534 | + * Zero value indicates an uninitialized buffer */ | |
535 | + uint16_t version; | |
536 | + | |
537 | + /* The size of DescStateSplit array. It's equal to the virtqueue | |
538 | + * size. Slave could get it from queue size field of VhostUserInflight. */ | |
539 | + uint16_t desc_num; | |
540 | + | |
541 | + /* The head of list that track the last batch of used descriptors. */ | |
542 | + uint16_t last_batch_head; | |
543 | + | |
544 | + /* Store the idx value of used ring */ | |
545 | + uint16_t used_idx; | |
546 | + | |
547 | + /* Used to track the state of each descriptor in descriptor table */ | |
548 | + DescStateSplit desc[0]; | |
549 | + } QueueRegionSplit; | |
550 | + | |
551 | +To track inflight I/O, the queue region should be processed as follows: | |
552 | + | |
553 | +When receiving available buffers from the driver: | |
554 | + | |
555 | +#. Get the next available head-descriptor index from available ring, ``i`` | |
556 | + | |
557 | +#. Set ``desc[i].counter`` to the value of global counter | |
558 | + | |
559 | +#. Increase global counter by 1 | |
560 | + | |
561 | +#. Set ``desc[i].inflight`` to 1 | |
562 | + | |
563 | +When supplying used buffers to the driver: | |
564 | + | |
565 | +1. Get corresponding used head-descriptor index, i | |
566 | + | |
567 | +2. Set ``desc[i].next`` to ``last_batch_head`` | |
568 | + | |
569 | +3. Set ``last_batch_head`` to ``i`` | |
570 | + | |
571 | +#. Steps 1,2,3 may be performed repeatedly if batching is possible | |
572 | + | |
573 | +#. Increase the ``idx`` value of used ring by the size of the batch | |
574 | + | |
575 | +#. Set the ``inflight`` field of each ``DescStateSplit`` entry in the batch to 0 | |
576 | + | |
577 | +#. Set ``used_idx`` to the ``idx`` value of used ring | |
578 | + | |
579 | +When reconnecting: | |
580 | + | |
581 | +#. If the value of ``used_idx`` does not match the ``idx`` value of | |
582 | + used ring (means the inflight field of ``DescStateSplit`` entries in | |
583 | + last batch may be incorrect), | |
584 | + | |
585 | + a. Subtract the value of ``used_idx`` from the ``idx`` value of | |
586 | + used ring to get last batch size of ``DescStateSplit`` entries | |
587 | + | |
588 | + #. Set the ``inflight`` field of each ``DescStateSplit`` entry to 0 in last batch | |
589 | + list which starts from ``last_batch_head`` | |
590 | + | |
591 | + #. Set ``used_idx`` to the ``idx`` value of used ring | |
592 | + | |
593 | +#. Resubmit inflight ``DescStateSplit`` entries in order of their | |
594 | + counter value | |
595 | + | |
596 | +For packed virtqueue, queue region can be implemented as: | |
597 | + | |
598 | +.. code:: c | |
599 | + | |
600 | + typedef struct DescStatePacked { | |
601 | + /* Indicate whether this descriptor is inflight or not. | |
602 | + * Only available for head-descriptor. */ | |
603 | + uint8_t inflight; | |
604 | + | |
605 | + /* Padding */ | |
606 | + uint8_t padding; | |
607 | + | |
608 | + /* Link to the next free entry */ | |
609 | + uint16_t next; | |
610 | + | |
611 | + /* Link to the last entry of descriptor list. | |
612 | + * Only available for head-descriptor. */ | |
613 | + uint16_t last; | |
614 | + | |
615 | + /* The length of descriptor list. | |
616 | + * Only available for head-descriptor. */ | |
617 | + uint16_t num; | |
618 | + | |
619 | + /* Used to preserve the order of fetching available descriptors. | |
620 | + * Only available for head-descriptor. */ | |
621 | + uint64_t counter; | |
622 | + | |
623 | + /* The buffer id */ | |
624 | + uint16_t id; | |
625 | + | |
626 | + /* The descriptor flags */ | |
627 | + uint16_t flags; | |
628 | + | |
629 | + /* The buffer length */ | |
630 | + uint32_t len; | |
631 | + | |
632 | + /* The buffer address */ | |
633 | + uint64_t addr; | |
634 | + } DescStatePacked; | |
635 | + | |
636 | + typedef struct QueueRegionPacked { | |
637 | + /* The feature flags of this region. Now it's initialized to 0. */ | |
638 | + uint64_t features; | |
639 | + | |
640 | + /* The version of this region. It's 1 currently. | |
641 | + * Zero value indicates an uninitialized buffer */ | |
642 | + uint16_t version; | |
643 | + | |
644 | + /* The size of DescStatePacked array. It's equal to the virtqueue | |
645 | + * size. Slave could get it from queue size field of VhostUserInflight. */ | |
646 | + uint16_t desc_num; | |
647 | + | |
648 | + /* The head of free DescStatePacked entry list */ | |
649 | + uint16_t free_head; | |
650 | + | |
651 | + /* The old head of free DescStatePacked entry list */ | |
652 | + uint16_t old_free_head; | |
653 | + | |
654 | + /* The used index of descriptor ring */ | |
655 | + uint16_t used_idx; | |
656 | + | |
657 | + /* The old used index of descriptor ring */ | |
658 | + uint16_t old_used_idx; | |
659 | + | |
660 | + /* Device ring wrap counter */ | |
661 | + uint8_t used_wrap_counter; | |
662 | + | |
663 | + /* The old device ring wrap counter */ | |
664 | + uint8_t old_used_wrap_counter; | |
665 | + | |
666 | + /* Padding */ | |
667 | + uint8_t padding[7]; | |
668 | + | |
669 | + /* Used to track the state of each descriptor fetched from descriptor ring */ | |
670 | + DescStatePacked desc[0]; | |
671 | + } QueueRegionPacked; | |
672 | + | |
673 | +To track inflight I/O, the queue region should be processed as follows: | |
674 | + | |
675 | +When receiving available buffers from the driver: | |
676 | + | |
677 | +#. Get the next available descriptor entry from descriptor ring, ``d`` | |
678 | + | |
679 | +#. If ``d`` is head descriptor, | |
680 | + | |
681 | + a. Set ``desc[old_free_head].num`` to 0 | |
682 | + | |
683 | + #. Set ``desc[old_free_head].counter`` to the value of global counter | |
684 | + | |
685 | + #. Increase global counter by 1 | |
686 | + | |
687 | + #. Set ``desc[old_free_head].inflight`` to 1 | |
688 | + | |
689 | +#. If ``d`` is last descriptor, set ``desc[old_free_head].last`` to | |
690 | + ``free_head`` | |
691 | + | |
692 | +#. Increase ``desc[old_free_head].num`` by 1 | |
693 | + | |
694 | +#. Set ``desc[free_head].addr``, ``desc[free_head].len``, | |
695 | + ``desc[free_head].flags``, ``desc[free_head].id`` to ``d.addr``, | |
696 | + ``d.len``, ``d.flags``, ``d.id`` | |
697 | + | |
698 | +#. Set ``free_head`` to ``desc[free_head].next`` | |
699 | + | |
700 | +#. If ``d`` is last descriptor, set ``old_free_head`` to ``free_head`` | |
701 | + | |
702 | +When supplying used buffers to the driver: | |
703 | + | |
704 | +1. Get corresponding used head-descriptor entry from descriptor ring, | |
705 | + ``d`` | |
706 | + | |
707 | +2. Get corresponding ``DescStatePacked`` entry, ``e`` | |
708 | + | |
709 | +3. Set ``desc[e.last].next`` to ``free_head`` | |
710 | + | |
711 | +4. Set ``free_head`` to the index of ``e`` | |
712 | + | |
713 | +#. Steps 1,2,3,4 may be performed repeatedly if batching is possible | |
714 | + | |
715 | +#. Increase ``used_idx`` by the size of the batch and update | |
716 | + ``used_wrap_counter`` if needed | |
717 | + | |
718 | +#. Update ``d.flags`` | |
719 | + | |
720 | +#. Set the ``inflight`` field of each head ``DescStatePacked`` entry | |
721 | + in the batch to 0 | |
722 | + | |
723 | +#. Set ``old_free_head``, ``old_used_idx``, ``old_used_wrap_counter`` | |
724 | + to ``free_head``, ``used_idx``, ``used_wrap_counter`` | |
725 | + | |
726 | +When reconnecting: | |
727 | + | |
728 | +#. If ``used_idx`` does not match ``old_used_idx`` (means the | |
729 | + ``inflight`` field of ``DescStatePacked`` entries in last batch may | |
730 | + be incorrect), | |
731 | + | |
732 | + a. Get the next descriptor ring entry through ``old_used_idx``, ``d`` | |
733 | + | |
734 | + #. Use ``old_used_wrap_counter`` to calculate the available flags | |
735 | + | |
736 | + #. If ``d.flags`` is not equal to the calculated flags value (means | |
737 | + slave has submitted the buffer to guest driver before crash, so | |
738 | + it has to commit the in-progress update), set ``old_free_head``, | |
739 | + ``old_used_idx``, ``old_used_wrap_counter`` to ``free_head``, | |
740 | + ``used_idx``, ``used_wrap_counter`` | |
741 | + | |
742 | +#. Set ``free_head``, ``used_idx``, ``used_wrap_counter`` to | |
743 | + ``old_free_head``, ``old_used_idx``, ``old_used_wrap_counter`` | |
744 | + (roll back any in-progress update) | |
745 | + | |
746 | +#. Set the ``inflight`` field of each ``DescStatePacked`` entry in | |
747 | + free list to 0 | |
748 | + | |
749 | +#. Resubmit inflight ``DescStatePacked`` entries in order of their | |
750 | + counter value | |
751 | + | |
752 | +Protocol features | |
753 | +----------------- | |
754 | + | |
755 | +.. code:: c | |
756 | + | |
757 | + #define VHOST_USER_PROTOCOL_F_MQ 0 | |
758 | + #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 | |
759 | + #define VHOST_USER_PROTOCOL_F_RARP 2 | |
760 | + #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 | |
761 | + #define VHOST_USER_PROTOCOL_F_NET_MTU 4 | |
762 | + #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 | |
763 | + #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 | |
764 | + #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 | |
765 | + #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 | |
766 | + #define VHOST_USER_PROTOCOL_F_CONFIG 9 | |
767 | + #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 | |
768 | + #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 | |
769 | + #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 | |
770 | + | |
771 | +Master message types | |
772 | +-------------------- | |
773 | + | |
774 | +``VHOST_USER_GET_FEATURES`` | |
775 | + :id: 1 | |
776 | + :equivalent ioctl: ``VHOST_GET_FEATURES`` | |
777 | + :master payload: N/A | |
778 | + :slave payload: ``u64`` | |
779 | + | |
780 | + Get from the underlying vhost implementation the features bitmask. | |
781 | + Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals slave support | |
782 | + for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and | |
783 | + ``VHOST_USER_SET_PROTOCOL_FEATURES``. | |
784 | + | |
785 | +``VHOST_USER_SET_FEATURES`` | |
786 | + :id: 2 | |
787 | + :equivalent ioctl: ``VHOST_SET_FEATURES`` | |
788 | + :master payload: ``u64`` | |
789 | + | |
790 | + Enable features in the underlying vhost implementation using a | |
791 | + bitmask. Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals | |
792 | + slave support for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and | |
793 | + ``VHOST_USER_SET_PROTOCOL_FEATURES``. | |
794 | + | |
795 | +``VHOST_USER_GET_PROTOCOL_FEATURES`` | |
796 | + :id: 15 | |
797 | + :equivalent ioctl: ``VHOST_GET_FEATURES`` | |
798 | + :master payload: N/A | |
799 | + :slave payload: ``u64`` | |
800 | + | |
801 | + Get the protocol feature bitmask from the underlying vhost | |
802 | + implementation. Only legal if feature bit | |
803 | + ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in | |
804 | + ``VHOST_USER_GET_FEATURES``. | |
805 | + | |
806 | +.. Note:: | |
807 | + Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must | |
808 | + support this message even before ``VHOST_USER_SET_FEATURES`` was | |
809 | + called. | |
810 | + | |
811 | +``VHOST_USER_SET_PROTOCOL_FEATURES`` | |
812 | + :id: 16 | |
813 | + :equivalent ioctl: ``VHOST_SET_FEATURES`` | |
814 | + :master payload: ``u64`` | |
815 | + | |
816 | + Enable protocol features in the underlying vhost implementation. | |
817 | + | |
818 | + Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in | |
819 | + ``VHOST_USER_GET_FEATURES``. | |
820 | + | |
821 | +.. Note:: | |
822 | + Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must support | |
823 | + this message even before ``VHOST_USER_SET_FEATURES`` was called. | |
824 | + | |
825 | +``VHOST_USER_SET_OWNER`` | |
826 | + :id: 3 | |
827 | + :equivalent ioctl: ``VHOST_SET_OWNER`` | |
828 | + :master payload: N/A | |
829 | + | |
830 | + Issued when a new connection is established. It sets the current | |
831 | + *master* as an owner of the session. This can be used on the *slave* | |
832 | + as a "session start" flag. | |
833 | + | |
834 | +``VHOST_USER_RESET_OWNER`` | |
835 | + :id: 4 | |
836 | + :master payload: N/A | |
837 | + | |
838 | +.. admonition:: Deprecated | |
839 | + | |
840 | + This is no longer used. Used to be sent to request disabling all | |
841 | + rings, but some clients interpreted it to also discard connection | |
842 | + state (this interpretation would lead to bugs). It is recommended | |
843 | + that clients either ignore this message, or use it to disable all | |
844 | + rings. | |
845 | + | |
846 | +``VHOST_USER_SET_MEM_TABLE`` | |
847 | + :id: 5 | |
848 | + :equivalent ioctl: ``VHOST_SET_MEM_TABLE`` | |
849 | + :master payload: memory regions description | |
850 | + :slave payload: (postcopy only) memory regions description | |
851 | + | |
852 | + Sets the memory map regions on the slave so it can translate the | |
853 | + vring addresses. In the ancillary data there is an array of file | |
854 | + descriptors for each memory mapped region. The size and ordering of | |
855 | + the fds matches the number and ordering of memory regions. | |
856 | + | |
857 | + When ``VHOST_USER_POSTCOPY_LISTEN`` has been received, | |
858 | + ``SET_MEM_TABLE`` replies with the bases of the memory mapped | |
859 | + regions to the master. The slave must have mmap'd the regions but | |
860 | + not yet accessed them and should not yet generate a userfault | |
861 | + event. | |
862 | + | |
863 | +.. Note:: | |
864 | + ``NEED_REPLY_MASK`` is not set in this case. QEMU will then | |
865 | + reply back to the list of mappings with an empty | |
866 | + ``VHOST_USER_SET_MEM_TABLE`` as an acknowledgement; only upon | |
867 | + reception of this message may the guest start accessing the memory | |
868 | + and generating faults. | |
869 | + | |
870 | +``VHOST_USER_SET_LOG_BASE`` | |
871 | + :id: 6 | |
872 | + :equivalent ioctl: ``VHOST_SET_LOG_BASE`` | |
873 | + :master payload: u64 | |
874 | + :slave payload: N/A | |
875 | + | |
876 | + Sets logging shared memory space. | |
877 | + | |
878 | + When slave has ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature, | |
879 | + the log memory fd is provided in the ancillary data of | |
880 | + ``VHOST_USER_SET_LOG_BASE`` message, the size and offset of shared | |
881 | + memory area provided in the message. | |
882 | + | |
883 | +``VHOST_USER_SET_LOG_FD`` | |
884 | + :id: 7 | |
885 | + :equivalent ioctl: ``VHOST_SET_LOG_FD`` | |
886 | + :master payload: N/A | |
887 | + | |
888 | + Sets the logging file descriptor, which is passed as ancillary data. | |
889 | + | |
890 | +``VHOST_USER_SET_VRING_NUM`` | |
891 | + :id: 8 | |
892 | + :equivalent ioctl: ``VHOST_SET_VRING_NUM`` | |
893 | + :master payload: vring state description | |
894 | + | |
895 | + Set the size of the queue. | |
896 | + | |
897 | +``VHOST_USER_SET_VRING_ADDR`` | |
898 | + :id: 9 | |
899 | + :equivalent ioctl: ``VHOST_SET_VRING_ADDR`` | |
900 | + :master payload: vring address description | |
901 | + :slave payload: N/A | |
902 | + | |
903 | + Sets the addresses of the different aspects of the vring. | |
904 | + | |
905 | +``VHOST_USER_SET_VRING_BASE`` | |
906 | + :id: 10 | |
907 | + :equivalent ioctl: ``VHOST_SET_VRING_BASE`` | |
908 | + :master payload: vring state description | |
909 | + | |
910 | + Sets the base offset in the available vring. | |
911 | + | |
912 | +``VHOST_USER_GET_VRING_BASE`` | |
913 | + :id: 11 | |
914 | + :equivalent ioctl: ``VHOST_GET_VRING_BASE`` | |
915 | + :master payload: vring state description | |
916 | + :slave payload: vring state description | |
917 | + | |
918 | + Get the available vring base offset. | |
919 | + | |
920 | +``VHOST_USER_SET_VRING_KICK`` | |
921 | + :id: 12 | |
922 | + :equivalent ioctl: ``VHOST_SET_VRING_KICK`` | |
923 | + :master payload: ``u64`` | |
924 | + | |
925 | + Set the event file descriptor for adding buffers to the vring. It is | |
926 | + passed in the ancillary data. | |
927 | + | |
928 | + Bits (0-7) of the payload contain the vring index. Bit 8 is the | |
929 | + invalid FD flag. This flag is set when there is no file descriptor | |
930 | + in the ancillary data. This signals that polling should be used | |
931 | + instead of waiting for a kick. | |
932 | + | |
933 | +``VHOST_USER_SET_VRING_CALL`` | |
934 | + :id: 13 | |
935 | + :equivalent ioctl: ``VHOST_SET_VRING_CALL`` | |
936 | + :master payload: ``u64`` | |
937 | + | |
938 | + Set the event file descriptor to signal when buffers are used. It is | |
939 | + passed in the ancillary data. | |
940 | + | |
941 | + Bits (0-7) of the payload contain the vring index. Bit 8 is the | |
942 | + invalid FD flag. This flag is set when there is no file descriptor | |
943 | + in the ancillary data. This signals that polling will be used | |
944 | + instead of waiting for the call. | |
945 | + | |
946 | +``VHOST_USER_SET_VRING_ERR`` | |
947 | + :id: 14 | |
948 | + :equivalent ioctl: ``VHOST_SET_VRING_ERR`` | |
949 | + :master payload: ``u64`` | |
950 | + | |
951 | + Set the event file descriptor to signal when error occurs. It is | |
952 | + passed in the ancillary data. | |
953 | + | |
954 | + Bits (0-7) of the payload contain the vring index. Bit 8 is the | |
955 | + invalid FD flag. This flag is set when there is no file descriptor | |
956 | + in the ancillary data. | |
957 | + | |
958 | +``VHOST_USER_GET_QUEUE_NUM`` | |
959 | + :id: 17 | |
960 | + :equivalent ioctl: N/A | |
961 | + :master payload: N/A | |
962 | + :slave payload: u64 | |
963 | + | |
964 | + Query how many queues the backend supports. | |
965 | + | |
966 | + This request should be sent only when ``VHOST_USER_PROTOCOL_F_MQ`` | |
967 | + is set in queried protocol features by | |
968 | + ``VHOST_USER_GET_PROTOCOL_FEATURES``. | |
969 | + | |
970 | +``VHOST_USER_SET_VRING_ENABLE`` | |
971 | + :id: 18 | |
972 | + :equivalent ioctl: N/A | |
973 | + :master payload: vring state description | |
974 | + | |
975 | + Signal slave to enable or disable corresponding vring. | |
976 | + | |
977 | + This request should be sent only when | |
978 | + ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated. | |
979 | + | |
980 | +``VHOST_USER_SEND_RARP`` | |
981 | + :id: 19 | |
982 | + :equivalent ioctl: N/A | |
983 | + :master payload: ``u64`` | |
984 | + | |
985 | + Ask vhost user backend to broadcast a fake RARP to notify that the | |
986 | + migration is terminated for guests that do not support GUEST_ANNOUNCE. | |
987 | + | |
988 | + Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is | |
989 | + present in ``VHOST_USER_GET_FEATURES`` and protocol feature bit | |
990 | + ``VHOST_USER_PROTOCOL_F_RARP`` is present in | |
991 | + ``VHOST_USER_GET_PROTOCOL_FEATURES``. The first 6 bytes of the | |
992 | + payload contain the mac address of the guest to allow the vhost user | |
993 | + backend to construct and broadcast the fake RARP. | |
994 | + | |
995 | +``VHOST_USER_NET_SET_MTU`` | |
996 | + :id: 20 | |
997 | + :equivalent ioctl: N/A | |
998 | + :master payload: ``u64`` | |
999 | + | |
1000 | + Set host MTU value exposed to the guest. | |
1001 | + | |
1002 | + This request should be sent only when ``VIRTIO_NET_F_MTU`` feature | |
1003 | + has been successfully negotiated, ``VHOST_USER_F_PROTOCOL_FEATURES`` | |
1004 | + is present in ``VHOST_USER_GET_FEATURES`` and protocol feature bit | |
1005 | + ``VHOST_USER_PROTOCOL_F_NET_MTU`` is present in | |
1006 | + ``VHOST_USER_GET_PROTOCOL_FEATURES``. | |
1007 | + | |
1008 | + If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must | |
1009 | + respond with zero in case the specified MTU is valid, or non-zero | |
1010 | + otherwise. | |
1011 | + | |
1012 | +``VHOST_USER_SET_SLAVE_REQ_FD`` | |
1013 | + :id: 21 | |
1014 | + :equivalent ioctl: N/A | |
1015 | + :master payload: N/A | |
1016 | + | |
1017 | + Set the socket file descriptor for slave initiated requests. It is passed | |
1018 | + in the ancillary data. | |
1019 | + | |
1020 | + This request should be sent only when | |
1021 | + ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, and protocol | |
1022 | + feature bit ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` is present in | |
1023 | + ``VHOST_USER_GET_PROTOCOL_FEATURES``. If | |
1024 | + ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must | |
1025 | + respond with zero for success, non-zero otherwise. | |
1026 | + | |
1027 | +``VHOST_USER_IOTLB_MSG`` | |
1028 | + :id: 22 | |
1029 | + :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type) | |
1030 | + :master payload: ``struct vhost_iotlb_msg`` | |
1031 | + :slave payload: ``u64`` | |
1032 | + | |
1033 | + Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload. | |
1034 | + | |
1035 | + Master sends such requests to update and invalidate entries in the | |
1036 | + device IOTLB. The slave has to acknowledge the request with sending | |
1037 | + zero as ``u64`` payload for success, non-zero otherwise. | |
1038 | + | |
1039 | + This request should be sent only when ``VIRTIO_F_IOMMU_PLATFORM`` | |
1040 | + feature has been successfully negotiated. | |
1041 | + | |
1042 | +``VHOST_USER_SET_VRING_ENDIAN`` | |
1043 | + :id: 23 | |
1044 | + :equivalent ioctl: ``VHOST_SET_VRING_ENDIAN`` | |
1045 | + :master payload: vring state description | |
1046 | + | |
1047 | + Set the endianness of a VQ for legacy devices. Little-endian is | |
1048 | + indicated with state.num set to 0 and big-endian is indicated with | |
1049 | + state.num set to 1. Other values are invalid. | |
1050 | + | |
1051 | + This request should be sent only when | |
1052 | + ``VHOST_USER_PROTOCOL_F_CROSS_ENDIAN`` has been negotiated. | |
1053 | + Backends that negotiated this feature should handle both | |
1054 | + endiannesses and expect this message once (per VQ) during device | |
1055 | + configuration (ie. before the master starts the VQ). | |
1056 | + | |
1057 | +``VHOST_USER_GET_CONFIG`` | |
1058 | + :id: 24 | |
1059 | + :equivalent ioctl: N/A | |
1060 | + :master payload: virtio device config space | |
1061 | + :slave payload: virtio device config space | |
1062 | + | |
1063 | + When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is | |
1064 | + submitted by the vhost-user master to fetch the contents of the | |
1065 | + virtio device configuration space, vhost-user slave's payload size | |
1066 | + MUST match master's request, vhost-user slave uses zero length of | |
1067 | + payload to indicate an error to vhost-user master. The vhost-user | |
1068 | + master may cache the contents to avoid repeated | |
1069 | + ``VHOST_USER_GET_CONFIG`` calls. | |
1070 | + | |
1071 | +``VHOST_USER_SET_CONFIG`` | |
1072 | + :id: 25 | |
1073 | + :equivalent ioctl: N/A | |
1074 | + :master payload: virtio device config space | |
1075 | + :slave payload: N/A | |
1076 | + | |
1077 | + When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is | |
1078 | + submitted by the vhost-user master when the Guest changes the virtio | |
1079 | + device configuration space and also can be used for live migration | |
1080 | + on the destination host. The vhost-user slave must check the flags | |
1081 | + field, and slaves MUST NOT accept SET_CONFIG for read-only | |
1082 | + configuration space fields unless the live migration bit is set. | |
1083 | + | |
1084 | +``VHOST_USER_CREATE_CRYPTO_SESSION`` | |
1085 | + :id: 26 | |
1086 | + :equivalent ioctl: N/A | |
1087 | + :master payload: crypto session description | |
1088 | + :slave payload: crypto session description | |
1089 | + | |
1090 | + Create a session for crypto operation. The server side must return | |
1091 | + the session id, 0 or positive for success, negative for failure. | |
1092 | + This request should be sent only when | |
1093 | + ``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been | |
1094 | + successfully negotiated. It's a required feature for crypto | |
1095 | + devices. | |
1096 | + | |
1097 | +``VHOST_USER_CLOSE_CRYPTO_SESSION`` | |
1098 | + :id: 27 | |
1099 | + :equivalent ioctl: N/A | |
1100 | + :master payload: ``u64`` | |
1101 | + | |
1102 | + Close a session for crypto operation which was previously | |
1103 | + created by ``VHOST_USER_CREATE_CRYPTO_SESSION``. | |
1104 | + | |
1105 | + This request should be sent only when | |
1106 | + ``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been | |
1107 | + successfully negotiated. It's a required feature for crypto | |
1108 | + devices. | |
1109 | + | |
1110 | +``VHOST_USER_POSTCOPY_ADVISE`` | |
1111 | + :id: 28 | |
1112 | + :master payload: N/A | |
1113 | + :slave payload: userfault fd | |
1114 | + | |
1115 | + When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, the master | |
1116 | + advises slave that a migration with postcopy enabled is underway, | |
1117 | + the slave must open a userfaultfd for later use. Note that at this | |
1118 | + stage the migration is still in precopy mode. | |
1119 | + | |
1120 | +``VHOST_USER_POSTCOPY_LISTEN`` | |
1121 | + :id: 29 | |
1122 | + :master payload: N/A | |
1123 | + | |
1124 | + Master advises slave that a transition to postcopy mode has | |
1125 | + happened. The slave must ensure that shared memory is registered | |
1126 | + with userfaultfd to cause faulting of non-present pages. | |
1127 | + | |
1128 | + This is always sent sometime after a ``VHOST_USER_POSTCOPY_ADVISE``, | |
1129 | + and thus only when ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported. | |
1130 | + | |
1131 | +``VHOST_USER_POSTCOPY_END`` | |
1132 | + :id: 30 | |
1133 | + :slave payload: ``u64`` | |
1134 | + | |
1135 | + Master advises that postcopy migration has now completed. The slave | |
1136 | + must disable the userfaultfd. The response is an acknowledgement | |
1137 | + only. | |
1138 | + | |
1139 | + When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, this message | |
1140 | + is sent at the end of the migration, after | |
1141 | + ``VHOST_USER_POSTCOPY_LISTEN`` was previously sent. | |
1142 | + | |
1143 | + The value returned is an error indication; 0 is success. | |
1144 | + | |
1145 | +``VHOST_USER_GET_INFLIGHT_FD`` | |
1146 | + :id: 31 | |
1147 | + :equivalent ioctl: N/A | |
1148 | + :master payload: inflight description | |
1149 | + | |
1150 | + When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has | |
1151 | + been successfully negotiated, this message is submitted by master to | |
1152 | + get a shared buffer from slave. The shared buffer will be used to | |
1153 | + track inflight I/O by slave. QEMU should retrieve a new one when the VM | |
1154 | + is reset. | |
1155 | + | |
1156 | +``VHOST_USER_SET_INFLIGHT_FD`` | |
1157 | + :id: 32 | |
1158 | + :equivalent ioctl: N/A | |
1159 | + :master payload: inflight description | |
1160 | + | |
1161 | + When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has | |
1162 | + been successfully negotiated, this message is submitted by master to | |
1163 | + send the shared inflight buffer back to slave so that slave could | |
1164 | + get inflight I/O after a crash or restart. | |
1165 | + | |
1166 | +Slave message types | |
1167 | +------------------- | |
1168 | + | |
1169 | +``VHOST_USER_SLAVE_IOTLB_MSG`` | |
1170 | + :id: 1 | |
1171 | + :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type) | |
1172 | + :slave payload: ``struct vhost_iotlb_msg`` | |
1173 | + :master payload: N/A | |
1174 | + | |
1175 | + Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload. | |
1176 | + Slave sends such requests to notify of an IOTLB miss, or an IOTLB | |
1177 | + access failure. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is | |
1178 | + negotiated, and slave set the ``VHOST_USER_NEED_REPLY`` flag, master | |
1179 | + must respond with zero when operation is successfully completed, or | |
1180 | + non-zero otherwise. This request should be sent only when | |
1181 | + ``VIRTIO_F_IOMMU_PLATFORM`` feature has been successfully | |
1182 | + negotiated. | |
1183 | + | |
1184 | +``VHOST_USER_SLAVE_CONFIG_CHANGE_MSG`` | |
1185 | + :id: 2 | |
1186 | + :equivalent ioctl: N/A | |
1187 | + :slave payload: N/A | |
1188 | + :master payload: N/A | |
1189 | + | |
1190 | + When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, vhost-user | |
1191 | + slave sends such messages to notify that the virtio device's | |
1192 | + configuration space has changed, for those host devices which can | |
1193 | + support such feature, host driver can send ``VHOST_USER_GET_CONFIG`` | |
1194 | + message to slave to get the latest content. If | |
1195 | + ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and slave set the | |
1196 | + ``VHOST_USER_NEED_REPLY`` flag, master must respond with zero when | |
1197 | + operation is successfully completed, or non-zero otherwise. | |
1198 | + | |
1199 | +``VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG`` | |
1200 | + :id: 3 | |
1201 | + :equivalent ioctl: N/A | |
1202 | + :slave payload: vring area description | |
1203 | + :master payload: N/A | |
1204 | + | |
1205 | + Sets host notifier for a specified queue. The queue index is | |
1206 | + contained in the ``u64`` field of the vring area description. The | |
1207 | + host notifier is described by the file descriptor (typically it's a | |
1208 | + VFIO device fd) which is passed as ancillary data and the size | |
1209 | + (which is mmap size and should be the same as host page size) and | |
1210 | + offset (which is mmap offset) carried in the vring area | |
1211 | + description. QEMU can mmap the file descriptor based on the size and | |
1212 | + offset to get a memory range. Registering a host notifier means | |
1213 | + mapping this memory range to the VM as the specified queue's notify | |
1214 | + MMIO region. Slave sends this request to tell QEMU to de-register | |
1215 | + the existing notifier if any and register the new notifier if the | |
1216 | + request is sent with a file descriptor. | |
1217 | + | |
1218 | + This request should be sent only when | |
1219 | + ``VHOST_USER_PROTOCOL_F_HOST_NOTIFIER`` protocol feature has been | |
1220 | + successfully negotiated. | |
1221 | + | |
1222 | +.. _reply_ack: | |
1223 | + | |
1224 | +VHOST_USER_PROTOCOL_F_REPLY_ACK | |
1225 | +------------------------------- | |
1226 | + | |
1227 | +The original vhost-user specification only demands replies for certain | |
1228 | +commands. This differs from the vhost protocol implementation where | |
1229 | +commands are sent over an ``ioctl()`` call and block until the client | |
1230 | +has completed. | |
1231 | + | |
1232 | +With this protocol extension negotiated, the sender (QEMU) can set the | |
1233 | +``need_reply`` [Bit 3] flag to any command. This indicates that the | |
1234 | +client MUST respond with a Payload ``VhostUserMsg`` indicating success | |
1235 | +or failure. The payload should be set to zero on success or non-zero | |
1236 | +on failure, unless the message already has an explicit reply body. | |
1237 | + | |
1238 | +The response payload gives QEMU a deterministic indication of the result | |
1239 | +of the command. Today, QEMU is expected to terminate the main vhost-user | |
1240 | + loop upon receiving such errors. In the future, QEMU could be taught to be more | |
1241 | +resilient for selective requests. | |
1242 | + | |
1243 | +For the message types that already solicit a reply from the client, | |
1244 | +the presence of ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` or need_reply bit | |
1245 | +being set brings no behavioural change. (See the Communication_ | |
1246 | +section for details.) | |
1247 | + | |
1248 | +.. _backend_conventions: | |
1249 | + | |
1250 | +Backend program conventions | |
1251 | +=========================== | |
1252 | + | |
1253 | +vhost-user backends can provide various devices & services and may | |
1254 | +need to be configured manually depending on the use case. However, it | |
1255 | +is a good idea to follow the conventions listed here when | |
1256 | +possible. Users, QEMU or libvirt, can then rely on some common | |
1257 | + behaviour to avoid heterogeneous configuration and management of the | |
1258 | +backend programs and facilitate interoperability. | |
1259 | + | |
1260 | +Each backend installed on a host system should come with at least one | |
1261 | +JSON file that conforms to the vhost-user.json schema. Each file | |
1262 | +informs the management applications about the backend type, and binary | |
1263 | +location. In addition, it defines rules for management apps for | |
1264 | +picking the highest priority backend when multiple match the search | |
1265 | +criteria (see ``@VhostUserBackend`` documentation in the schema file). | |
1266 | + | |
1267 | +If the backend is not capable of enabling a requested feature on the | |
1268 | +host (such as 3D acceleration with virgl), or the initialization | |
1269 | +failed, the backend should fail to start early and exit with a status | |
1270 | +!= 0. It may also print a message to stderr for further details. | |
1271 | + | |
1272 | +The backend program must not daemonize itself, but it may be | |
1273 | +daemonized by the management layer. It may also have a restricted | |
1274 | +access to the system. | |
1275 | + | |
1276 | +File descriptors 0, 1 and 2 will exist, and have regular | |
1277 | +stdin/stdout/stderr usage (they may have been redirected to /dev/null | |
1278 | +by the management layer, or to a log handler). | |
1279 | + | |
1280 | +The backend program must end (as quickly and cleanly as possible) when | |
1281 | +the SIGTERM signal is received. Eventually, it may receive SIGKILL by | |
1282 | +the management layer after a few seconds. | |
1283 | + | |
1284 | +The following command line options have an expected behaviour. They | |
1285 | +are mandatory, unless explicitly said differently: | |
1286 | + | |
1287 | +--socket-path=PATH | |
1288 | + | |
1289 | + This option specifies the location of the vhost-user Unix domain socket. | |
1290 | + It is incompatible with --fd. | |
1291 | + | |
1292 | +--fd=FDNUM | |
1293 | + | |
1294 | + When this argument is given, the backend program is started with the | |
1295 | + vhost-user socket as file descriptor FDNUM. It is incompatible with | |
1296 | + --socket-path. | |
1297 | + | |
1298 | +--print-capabilities | |
1299 | + | |
1300 | + Output to stdout the backend capabilities in JSON format, and then | |
1301 | + exit successfully. Other options and arguments should be ignored, and | |
1302 | + the backend program should not perform its normal function. The | |
1303 | + capabilities can be reported dynamically depending on the host | |
1304 | + capabilities. | |
1305 | + | |
1306 | +The JSON output is described in the ``vhost-user.json`` schema, by | |
1307 | + ``@VHostUserBackendCapabilities``. Example: | |
1308 | + | |
1309 | +.. code:: json | |
1310 | + | |
1311 | + { | |
1312 | + "type": "foo", | |
1313 | + "features": [ | |
1314 | + "feature-a", | |
1315 | + "feature-b" | |
1316 | + ] | |
1317 | + } | |
1318 | + | |
1319 | +vhost-user-input | |
1320 | +---------------- | |
1321 | + | |
1322 | +Command line options: | |
1323 | + | |
1324 | +--evdev-path=PATH | |
1325 | + | |
1326 | + Specify the linux input device. | |
1327 | + | |
1328 | + (optional) | |
1329 | + | |
1330 | +--no-grab | |
1331 | + | |
1332 | + Do not request exclusive access to the input device. | |
1333 | + | |
1334 | + (optional) | |
1335 | + | |
1336 | +vhost-user-gpu | |
1337 | +-------------- | |
1338 | + | |
1339 | +Command line options: | |
1340 | + | |
1341 | +--render-node=PATH | |
1342 | + | |
1343 | + Specify the GPU DRM render node. | |
1344 | + | |
1345 | + (optional) | |
1346 | + | |
1347 | +--virgl | |
1348 | + | |
1349 | + Enable virgl rendering support. | |
1350 | + | |
1351 | + (optional) |
@@ -1,1219 +0,0 @@ | ||
1 | -Vhost-user Protocol | |
2 | -=================== | |
3 | - | |
4 | -Copyright (c) 2014 Virtual Open Systems Sarl. | |
5 | - | |
6 | -This work is licensed under the terms of the GNU GPL, version 2 or later. | |
7 | -See the COPYING file in the top-level directory. | |
8 | -=================== | |
9 | - | |
10 | -This protocol is aiming to complement the ioctl interface used to control the | |
11 | -vhost implementation in the Linux kernel. It implements the control plane needed | |
12 | -to establish virtqueue sharing with a user space process on the same host. It | |
13 | -uses communication over a Unix domain socket to share file descriptors in the | |
14 | -ancillary data of the message. | |
15 | - | |
16 | -The protocol defines 2 sides of the communication, master and slave. Master is | |
17 | -the application that shares its virtqueues, in our case QEMU. Slave is the | |
18 | -consumer of the virtqueues. | |
19 | - | |
20 | -In the current implementation QEMU is the Master, and the Slave is the | |
21 | -external process consuming the virtio queues, for example a software | |
22 | -Ethernet switch running in user space, such as Snabbswitch, or a block | |
23 | -device backend processing read & write to a virtual disk. In order to | |
24 | -facilitate interoperability between various backend implementations, | |
25 | -it is recommended to follow the "Backend program conventions" | |
26 | -described in this document. | |
27 | - | |
28 | -Master and slave can be either a client (i.e. connecting) or server (listening) | |
29 | -in the socket communication. | |
30 | - | |
31 | -Message Specification | |
32 | ---------------------- | |
33 | - | |
34 | -Note that all numbers are in the machine native byte order. A vhost-user message | |
35 | -consists of 3 header fields and a payload: | |
36 | - | |
37 | ------------------------------------- | |
38 | -| request | flags | size | payload | | |
39 | ------------------------------------- | |
40 | - | |
41 | - * Request: 32-bit type of the request | |
42 | - * Flags: 32-bit bit field: | |
43 | - - Lower 2 bits are the version (currently 0x01) | |
44 | - - Bit 2 is the reply flag - needs to be sent on each reply from the slave | |
45 | - - Bit 3 is the need_reply flag - see VHOST_USER_PROTOCOL_F_REPLY_ACK for | |
46 | - details. | |
47 | - * Size - 32-bit size of the payload | |
48 | - | |
49 | - | |
50 | -Depending on the request type, payload can be: | |
51 | - | |
52 | - * A single 64-bit integer | |
53 | - ------- | |
54 | - | u64 | | |
55 | - ------- | |
56 | - | |
57 | - u64: a 64-bit unsigned integer | |
58 | - | |
59 | - * A vring state description | |
60 | - --------------- | |
61 | - | index | num | | |
62 | - --------------- | |
63 | - | |
64 | - Index: a 32-bit index | |
65 | - Num: a 32-bit number | |
66 | - | |
67 | - * A vring address description | |
68 | - -------------------------------------------------------------- | |
69 | - | index | flags | size | descriptor | used | available | log | | |
70 | - -------------------------------------------------------------- | |
71 | - | |
72 | - Index: a 32-bit vring index | |
73 | - Flags: a 32-bit vring flags | |
74 | - Descriptor: a 64-bit ring address of the vring descriptor table | |
75 | - Used: a 64-bit ring address of the vring used ring | |
76 | - Available: a 64-bit ring address of the vring available ring | |
77 | - Log: a 64-bit guest address for logging | |
78 | - | |
79 | - Note that a ring address is an IOVA if VIRTIO_F_IOMMU_PLATFORM has been | |
80 | - negotiated. Otherwise it is a user address. | |
81 | - | |
82 | - * Memory regions description | |
83 | - --------------------------------------------------- | |
84 | - | num regions | padding | region0 | ... | region7 | | |
85 | - --------------------------------------------------- | |
86 | - | |
87 | - Num regions: a 32-bit number of regions | |
88 | - Padding: 32-bit | |
89 | - | |
90 | - A region is: | |
91 | - ----------------------------------------------------- | |
92 | - | guest address | size | user address | mmap offset | | |
93 | - ----------------------------------------------------- | |
94 | - | |
95 | - Guest address: a 64-bit guest address of the region | |
96 | - Size: a 64-bit size | |
97 | - User address: a 64-bit user address | |
98 | - mmap offset: 64-bit offset where region starts in the mapped memory | |
99 | - | |
100 | -* Log description | |
101 | - --------------------------- | |
102 | - | log size | log offset | | |
103 | - --------------------------- | |
104 | - log size: size of area used for logging | |
105 | - log offset: offset from start of supplied file descriptor | |
106 | - where logging starts (i.e. where guest address 0 would be logged) | |
107 | - | |
108 | - * An IOTLB message | |
109 | - --------------------------------------------------------- | |
110 | - | iova | size | user address | permissions flags | type | | |
111 | - --------------------------------------------------------- | |
112 | - | |
113 | - IOVA: a 64-bit I/O virtual address programmed by the guest | |
114 | - Size: a 64-bit size | |
115 | - User address: a 64-bit user address | |
116 | - Permissions: an 8-bit value: | |
117 | - - 0: No access | |
118 | - - 1: Read access | |
119 | - - 2: Write access | |
120 | - - 3: Read/Write access | |
121 | - Type: an 8-bit IOTLB message type: | |
122 | - - 1: IOTLB miss | |
123 | - - 2: IOTLB update | |
124 | - - 3: IOTLB invalidate | |
125 | - - 4: IOTLB access fail | |
126 | - | |
127 | - * Virtio device config space | |
128 | - ----------------------------------- | |
129 | - | offset | size | flags | payload | | |
130 | - ----------------------------------- | |
131 | - | |
132 | - Offset: a 32-bit offset of virtio device's configuration space | |
133 | - Size: a 32-bit configuration space access size in bytes | |
134 | - Flags: a 32-bit value: | |
135 | - - 0: Vhost master messages used for writeable fields | |
136 | - - 1: Vhost master messages used for live migration | |
137 | - Payload: Size bytes array holding the contents of the virtio | |
138 | - device's configuration space | |
139 | - | |
140 | - * Vring area description | |
141 | - ----------------------- | |
142 | - | u64 | size | offset | | |
143 | - ----------------------- | |
144 | - | |
145 | - u64: a 64-bit integer containing vring index and flags | |
146 | - Size: a 64-bit size of this area | |
147 | - Offset: a 64-bit offset of this area from the start of the | |
148 | - supplied file descriptor | |
149 | - | |
150 | - * Inflight description | |
151 | - ----------------------------------------------------- | |
152 | - | mmap size | mmap offset | num queues | queue size | | |
153 | - ----------------------------------------------------- | |
154 | - | |
155 | - mmap size: a 64-bit size of area to track inflight I/O | |
156 | - mmap offset: a 64-bit offset of this area from the start | |
157 | - of the supplied file descriptor | |
158 | - num queues: a 16-bit number of virtqueues | |
159 | - queue size: a 16-bit size of virtqueues | |
160 | - | |
161 | -In QEMU the vhost-user message is implemented with the following struct: | |
162 | - | |
163 | -typedef struct VhostUserMsg { | |
164 | - VhostUserRequest request; | |
165 | - uint32_t flags; | |
166 | - uint32_t size; | |
167 | - union { | |
168 | - uint64_t u64; | |
169 | - struct vhost_vring_state state; | |
170 | - struct vhost_vring_addr addr; | |
171 | - VhostUserMemory memory; | |
172 | - VhostUserLog log; | |
173 | - struct vhost_iotlb_msg iotlb; | |
174 | - VhostUserConfig config; | |
175 | - VhostUserVringArea area; | |
176 | - VhostUserInflight inflight; | |
177 | - }; | |
178 | -} QEMU_PACKED VhostUserMsg; | |
179 | - | |
180 | -Communication | |
181 | -------------- | |
182 | - | |
183 | -The protocol for vhost-user is based on the existing implementation of vhost | |
184 | -for the Linux Kernel. Most messages that can be sent via the Unix domain socket | |
185 | -implementing vhost-user have an equivalent ioctl to the kernel implementation. | |
186 | - | |
187 | -The communication consists of master sending message requests and slave sending | |
188 | -message replies. Most of the requests don't require replies. Here is a list of | |
189 | -the ones that do: | |
190 | - | |
191 | - * VHOST_USER_GET_FEATURES | |
192 | - * VHOST_USER_GET_PROTOCOL_FEATURES | |
193 | - * VHOST_USER_GET_VRING_BASE | |
194 | - * VHOST_USER_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD) | |
195 | - * VHOST_USER_GET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD) | |
196 | - | |
197 | -[ Also see the section on REPLY_ACK protocol extension. ] | |
198 | - | |
199 | -There are several messages that the master sends with file descriptors passed | |
200 | -in the ancillary data: | |
201 | - | |
202 | - * VHOST_USER_SET_MEM_TABLE | |
203 | - * VHOST_USER_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD) | |
204 | - * VHOST_USER_SET_LOG_FD | |
205 | - * VHOST_USER_SET_VRING_KICK | |
206 | - * VHOST_USER_SET_VRING_CALL | |
207 | - * VHOST_USER_SET_VRING_ERR | |
208 | - * VHOST_USER_SET_SLAVE_REQ_FD | |
209 | - * VHOST_USER_SET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD) | |
210 | - | |
211 | -If Master is unable to send the full message or receives a wrong reply it will | |
212 | -close the connection. An optional reconnection mechanism can be implemented. | |
213 | - | |
214 | -Any protocol extensions are gated by protocol feature bits, | |
215 | -which allows full backwards compatibility on both master | |
216 | -and slave. | |
217 | -As older slaves don't support negotiating protocol features, | |
218 | -a feature bit was dedicated for this purpose: | |
219 | -#define VHOST_USER_F_PROTOCOL_FEATURES 30 | |
220 | - | |
221 | -Starting and stopping rings | |
222 | ----------------------- | |
223 | -Client must only process each ring when it is started. | |
224 | - | |
225 | -Client must only pass data between the ring and the | |
226 | -backend, when the ring is enabled. | |
227 | - | |
228 | -If ring is started but disabled, client must process the | |
229 | -ring without talking to the backend. | |
230 | - | |
231 | -For example, for a networking device, in the disabled state | |
232 | -client must not supply any new RX packets, but must process | |
233 | -and discard any TX packets. | |
234 | - | |
235 | -If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized | |
236 | -in an enabled state. | |
237 | - | |
238 | -If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, the ring is initialized | |
239 | -in a disabled state. Client must not pass data to/from the backend until ring is enabled by | |
240 | -VHOST_USER_SET_VRING_ENABLE with parameter 1, or after it has been disabled by | |
241 | -VHOST_USER_SET_VRING_ENABLE with parameter 0. | |
242 | - | |
243 | -Each ring is initialized in a stopped state, client must not process it until | |
244 | -ring is started, or after it has been stopped. | |
245 | - | |
246 | -Client must start ring upon receiving a kick (that is, detecting that file | |
247 | -descriptor is readable) on the descriptor specified by | |
248 | -VHOST_USER_SET_VRING_KICK, and stop ring upon receiving | |
249 | -VHOST_USER_GET_VRING_BASE. | |
250 | - | |
251 | -While processing the rings (whether they are enabled or not), client must | |
252 | -support changing some configuration aspects on the fly. | |
253 | - | |
254 | -Multiple queue support | |
255 | ----------------------- | |
256 | - | |
257 | -Multiple queue is treated as a protocol extension, hence the slave has to | |
258 | -implement protocol features first. The multiple queues feature is supported | |
259 | -only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set. | |
260 | - | |
261 | -The max number of queue pairs the slave supports can be queried with message | |
262 | -VHOST_USER_GET_QUEUE_NUM. Master should stop when the number of | |
263 | -requested queues is bigger than that. | |
264 | - | |
265 | -As all queues share one connection, the master uses a unique index for each | |
266 | -queue in the sent message to identify a specified queue. One queue pair | |
267 | -is enabled initially. More queues are enabled dynamically, by sending | |
268 | -message VHOST_USER_SET_VRING_ENABLE. | |
269 | - | |
270 | -Migration | |
271 | ---------- | |
272 | - | |
273 | -During live migration, the master may need to track the modifications | |
274 | -the slave makes to the memory mapped regions. The client should mark | |
275 | -the dirty pages in a log. Once it complies with this logging, it may | |
276 | -declare the VHOST_F_LOG_ALL vhost feature. | |
277 | - | |
278 | -To start/stop logging of data/used ring writes, server may send messages | |
279 | -VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and VHOST_USER_SET_VRING_ADDR with | |
280 | -VHOST_VRING_F_LOG in ring's flags set to 1/0, respectively. | |
281 | - | |
282 | -All the modifications to memory pointed by vring "descriptor" should | |
283 | -be marked. Modifications to "used" vring should be marked if | |
284 | -VHOST_VRING_F_LOG is part of ring's flags. | |
285 | - | |
286 | -Dirty pages are of size: | |
287 | -#define VHOST_LOG_PAGE 0x1000 | |
288 | - | |
289 | -The log memory fd is provided in the ancillary data of | |
290 | -VHOST_USER_SET_LOG_BASE message when the slave has | |
291 | -VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature. | |
292 | - | |
293 | -The size of the log is supplied as part of VhostUserMsg | |
294 | -which should be large enough to cover all known guest | |
295 | -addresses. Log starts at the supplied offset in the | |
296 | -supplied file descriptor. | |
297 | -The log covers from address 0 to the maximum of guest | |
298 | -regions. In pseudo-code, to mark page at "addr" as dirty: | |
299 | - | |
300 | -page = addr / VHOST_LOG_PAGE | |
301 | -log[page / 8] |= 1 << page % 8 | |
302 | - | |
303 | -Where addr is the guest physical address. | |
304 | - | |
305 | -Use atomic operations, as the log may be concurrently manipulated. | |
306 | - | |
307 | -Note that when logging modifications to the used ring (when VHOST_VRING_F_LOG | |
308 | -is set for this ring), log_guest_addr should be used to calculate the log | |
309 | -offset: the write to first byte of the used ring is logged at this offset from | |
310 | -log start. Also note that this value might be outside the legal guest physical | |
311 | -address range (i.e. does not have to be covered by the VhostUserMemory table), | |
312 | -but the bit offset of the last byte of the ring must fall within | |
313 | -the size supplied by VhostUserLog. | |
314 | - | |
315 | -VHOST_USER_SET_LOG_FD is an optional message with an eventfd in | |
316 | -ancillary data, it may be used to inform the master that the log has | |
317 | -been modified. | |
318 | - | |
319 | -Once the source has finished migration, rings will be stopped by | |
320 | -the source. No further update must be done before rings are | |
321 | -restarted. | |
322 | - | |
323 | -In postcopy migration the slave is started before all the memory has been | |
324 | -received from the source host, and care must be taken to avoid accessing pages | |
325 | -that have yet to be received. The slave opens a 'userfault'-fd and registers | |
326 | -the memory with it; this fd is then passed back over to the master. | |
327 | -The master services requests on the userfaultfd for pages that are accessed | |
328 | -and when the page is available it performs WAKE ioctl's on the userfaultfd | |
329 | -to wake the stalled slave. The client indicates support for this via the | |
330 | -VHOST_USER_PROTOCOL_F_PAGEFAULT feature. | |
331 | - | |
332 | -Memory access | |
333 | -------------- | |
334 | - | |
335 | -The master sends a list of vhost memory regions to the slave using the | |
336 | -VHOST_USER_SET_MEM_TABLE message. Each region has two base addresses: a guest | |
337 | -address and a user address. | |
338 | - | |
339 | -Messages contain guest addresses and/or user addresses to reference locations | |
340 | -within the shared memory. The mapping of these addresses works as follows. | |
341 | - | |
342 | -User addresses map to the vhost memory region containing that user address. | |
343 | - | |
344 | -When the VIRTIO_F_IOMMU_PLATFORM feature has not been negotiated: | |
345 | - | |
346 | - * Guest addresses map to the vhost memory region containing that guest | |
347 | - address. | |
348 | - | |
349 | -When the VIRTIO_F_IOMMU_PLATFORM feature has been negotiated: | |
350 | - | |
351 | - * Guest addresses are also called I/O virtual addresses (IOVAs). They are | |
352 | - translated to user addresses via the IOTLB. | |
353 | - | |
354 | - * The vhost memory region guest address is not used. | |
355 | - | |
356 | -IOMMU support | |
357 | -------------- | |
358 | - | |
359 | -When the VIRTIO_F_IOMMU_PLATFORM feature has been negotiated, the master | |
360 | -sends IOTLB entries update & invalidation by sending VHOST_USER_IOTLB_MSG | |
361 | -requests to the slave with a struct vhost_iotlb_msg as payload. For update | |
362 | -events, the iotlb payload has to be filled with the update message type (2), | |
363 | -the I/O virtual address, the size, the user virtual address, and the | |
364 | -permissions flags. Addresses and size must be within vhost memory regions set | |
365 | -via the VHOST_USER_SET_MEM_TABLE request. For invalidation events, the iotlb | |
366 | -payload has to be filled with the invalidation message type (3), the I/O virtual | |
367 | -address and the size. On success, the slave is expected to reply with a zero | |
368 | -payload, non-zero otherwise. | |
369 | - | |
370 | -The slave relies on the slave communication channel (see "Slave communication" | |
371 | -section below) to send IOTLB miss and access failure events, by sending | |
372 | -VHOST_USER_SLAVE_IOTLB_MSG requests to the master with a struct vhost_iotlb_msg | |
373 | -as payload. For miss events, the iotlb payload has to be filled with the miss | |
374 | -message type (1), the I/O virtual address and the permissions flags. For access | |
375 | -failure event, the iotlb payload has to be filled with the access failure | |
376 | -message type (4), the I/O virtual address and the permissions flags. | |
377 | -For synchronization purpose, the slave may rely on the reply-ack feature, | |
378 | -so the master may send a reply when operation is completed if the reply-ack | |
379 | -feature is negotiated and slave requests a reply. For miss events, completed | |
380 | -operation means either master sent an update message containing the IOTLB entry | |
381 | -containing requested address and permission, or master sent nothing if the IOTLB | |
382 | -miss message is invalid (invalid IOVA or permission). | |
383 | - | |
384 | -The master isn't expected to take the initiative to send IOTLB update messages, | |
385 | -as the slave sends IOTLB miss messages for the guest virtual memory areas it | |
386 | -needs to access. | |
387 | - | |
388 | -Slave communication | |
389 | -------------------- | |
390 | - | |
391 | -An optional communication channel is provided if the slave declares | |
392 | -VHOST_USER_PROTOCOL_F_SLAVE_REQ protocol feature, to allow the slave to make | |
393 | -requests to the master. | |
394 | - | |
395 | -The fd is provided via VHOST_USER_SET_SLAVE_REQ_FD ancillary data. | |
396 | - | |
397 | -A slave may then send VHOST_USER_SLAVE_* messages to the master | |
398 | -using this fd communication channel. | |
399 | - | |
400 | -If VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD protocol feature is negotiated, | |
401 | -slave can send file descriptors (at most 8 descriptors in each message) | |
402 | -to master via ancillary data using this fd communication channel. | |
403 | - | |
404 | -Inflight I/O tracking | |
405 | ---------------------- | |
406 | - | |
407 | -To support reconnecting after restart or crash, slave may need to resubmit | |
408 | -inflight I/Os. If virtqueue is processed in order, we can easily achieve | |
409 | -that by getting the inflight descriptors from descriptor table (split virtqueue) | |
410 | -or descriptor ring (packed virtqueue). However, it can't work when we process | |
411 | -descriptors out-of-order because some entries which store the information of | |
412 | -inflight descriptors in available ring (split virtqueue) or descriptor | |
413 | -ring (packed virtqueue) might be overwritten by new entries. To solve this | |
414 | -problem, slave needs to allocate an extra buffer to store this information of inflight | |
415 | -descriptors and share it with master for persistence. VHOST_USER_GET_INFLIGHT_FD and | |
416 | -VHOST_USER_SET_INFLIGHT_FD are used to transfer this buffer between master | |
417 | -and slave. And the format of this buffer is described below: | |
418 | - | |
419 | -------------------------------------------------------- | |
420 | -| queue0 region | queue1 region | ... | queueN region | | |
421 | -------------------------------------------------------- | |
422 | - | |
423 | -N is the number of available virtqueues. Slave could get it from num queues | |
424 | -field of VhostUserInflight. | |
425 | - | |
426 | -For split virtqueue, queue region can be implemented as: | |
427 | - | |
428 | -typedef struct DescStateSplit { | |
429 | - /* Indicate whether this descriptor is inflight or not. | |
430 | - * Only available for head-descriptor. */ | |
431 | - uint8_t inflight; | |
432 | - | |
433 | - /* Padding */ | |
434 | - uint8_t padding[5]; | |
435 | - | |
436 | - /* Maintain a list for the last batch of used descriptors. | |
437 | - * Only available when batching is used for submitting */ | |
438 | - uint16_t next; | |
439 | - | |
440 | - /* Used to preserve the order of fetching available descriptors. | |
441 | - * Only available for head-descriptor. */ | |
442 | - uint64_t counter; | |
443 | -} DescStateSplit; | |
444 | - | |
445 | -typedef struct QueueRegionSplit { | |
446 | - /* The feature flags of this region. Now it's initialized to 0. */ | |
447 | - uint64_t features; | |
448 | - | |
449 | - /* The version of this region. It's 1 currently. | |
450 | - * Zero value indicates an uninitialized buffer */ | |
451 | - uint16_t version; | |
452 | - | |
453 | - /* The size of DescStateSplit array. It's equal to the virtqueue | |
454 | - * size. Slave could get it from queue size field of VhostUserInflight. */ | |
455 | - uint16_t desc_num; | |
456 | - | |
457 | - /* The head of list that track the last batch of used descriptors. */ | |
458 | - uint16_t last_batch_head; | |
459 | - | |
460 | - /* Store the idx value of used ring */ | |
461 | - uint16_t used_idx; | |
462 | - | |
463 | - /* Used to track the state of each descriptor in descriptor table */ | |
464 | - DescStateSplit desc[0]; | |
465 | -} QueueRegionSplit; | |
466 | - | |
467 | -To track inflight I/O, the queue region should be processed as follows: | |
468 | - | |
469 | -When receiving available buffers from the driver: | |
470 | - | |
471 | - 1. Get the next available head-descriptor index from available ring, i | |
472 | - | |
473 | - 2. Set desc[i].counter to the value of global counter | |
474 | - | |
475 | - 3. Increase global counter by 1 | |
476 | - | |
477 | - 4. Set desc[i].inflight to 1 | |
478 | - | |
479 | -When supplying used buffers to the driver: | |
480 | - | |
481 | - 1. Get corresponding used head-descriptor index, i | |
482 | - | |
483 | - 2. Set desc[i].next to last_batch_head | |
484 | - | |
485 | - 3. Set last_batch_head to i | |
486 | - | |
487 | - 4. Steps 1,2,3 may be performed repeatedly if batching is possible | |
488 | - | |
489 | - 5. Increase the idx value of used ring by the size of the batch | |
490 | - | |
491 | - 6. Set the inflight field of each DescStateSplit entry in the batch to 0 | |
492 | - | |
493 | - 7. Set used_idx to the idx value of used ring | |
494 | - | |
495 | -When reconnecting: | |
496 | - | |
497 | - 1. If the value of used_idx does not match the idx value of used ring (means | |
498 | - the inflight field of DescStateSplit entries in last batch may be incorrect), | |
499 | - | |
500 | - (a) Subtract the value of used_idx from the idx value of used ring to get | |
501 | - last batch size of DescStateSplit entries | |
502 | - | |
503 | - (b) Set the inflight field of each DescStateSplit entry to 0 in last batch | |
504 | - list which starts from last_batch_head | |
505 | - | |
506 | - (c) Set used_idx to the idx value of used ring | |
507 | - | |
508 | - 2. Resubmit inflight DescStateSplit entries in order of their counter value | |
509 | - | |
510 | -For packed virtqueue, queue region can be implemented as: | |
511 | - | |
512 | -typedef struct DescStatePacked { | |
513 | - /* Indicate whether this descriptor is inflight or not. | |
514 | - * Only available for head-descriptor. */ | |
515 | - uint8_t inflight; | |
516 | - | |
517 | - /* Padding */ | |
518 | - uint8_t padding; | |
519 | - | |
520 | - /* Link to the next free entry */ | |
521 | - uint16_t next; | |
522 | - | |
523 | - /* Link to the last entry of descriptor list. | |
524 | - * Only available for head-descriptor. */ | |
525 | - uint16_t last; | |
526 | - | |
527 | - /* The length of descriptor list. | |
528 | - * Only available for head-descriptor. */ | |
529 | - uint16_t num; | |
530 | - | |
531 | - /* Used to preserve the order of fetching available descriptors. | |
532 | - * Only available for head-descriptor. */ | |
533 | - uint64_t counter; | |
534 | - | |
535 | - /* The buffer id */ | |
536 | - uint16_t id; | |
537 | - | |
538 | - /* The descriptor flags */ | |
539 | - uint16_t flags; | |
540 | - | |
541 | - /* The buffer length */ | |
542 | - uint32_t len; | |
543 | - | |
544 | - /* The buffer address */ | |
545 | - uint64_t addr; | |
546 | -} DescStatePacked; | |
547 | - | |
548 | -typedef struct QueueRegionPacked { | |
549 | - /* The feature flags of this region. Now it's initialized to 0. */ | |
550 | - uint64_t features; | |
551 | - | |
552 | - /* The version of this region. It's 1 currently. | |
553 | - * Zero value indicates an uninitialized buffer */ | |
554 | - uint16_t version; | |
555 | - | |
556 | - /* The size of DescStatePacked array. It's equal to the virtqueue | |
557 | - * size. Slave could get it from queue size field of VhostUserInflight. */ | |
558 | - uint16_t desc_num; | |
559 | - | |
560 | - /* The head of free DescStatePacked entry list */ | |
561 | - uint16_t free_head; | |
562 | - | |
563 | - /* The old head of free DescStatePacked entry list */ | |
564 | - uint16_t old_free_head; | |
565 | - | |
566 | - /* The used index of descriptor ring */ | |
567 | - uint16_t used_idx; | |
568 | - | |
569 | - /* The old used index of descriptor ring */ | |
570 | - uint16_t old_used_idx; | |
571 | - | |
572 | - /* Device ring wrap counter */ | |
573 | - uint8_t used_wrap_counter; | |
574 | - | |
575 | - /* The old device ring wrap counter */ | |
576 | - uint8_t old_used_wrap_counter; | |
577 | - | |
578 | - /* Padding */ | |
579 | - uint8_t padding[7]; | |
580 | - | |
581 | - /* Used to track the state of each descriptor fetched from descriptor ring */ | |
582 | - DescStatePacked desc[0]; | |
583 | -} QueueRegionPacked; | |
584 | - | |
585 | -To track inflight I/O, the queue region should be processed as follows: | |
586 | - | |
587 | -When receiving available buffers from the driver: | |
588 | - | |
589 | - 1. Get the next available descriptor entry from descriptor ring, d | |
590 | - | |
591 | - 2. If d is head descriptor, | |
592 | - | |
593 | - (a) Set desc[old_free_head].num to 0 | |
594 | - | |
595 | - (b) Set desc[old_free_head].counter to the value of global counter | |
596 | - | |
597 | - (c) Increase global counter by 1 | |
598 | - | |
599 | - (d) Set desc[old_free_head].inflight to 1 | |
600 | - | |
601 | - 3. If d is last descriptor, set desc[old_free_head].last to free_head | |
602 | - | |
603 | - 4. Increase desc[old_free_head].num by 1 | |
604 | - | |
605 | - 5. Set desc[free_head].addr, desc[free_head].len, desc[free_head].flags, | |
606 | - desc[free_head].id to d.addr, d.len, d.flags, d.id | |
607 | - | |
608 | - 6. Set free_head to desc[free_head].next | |
609 | - | |
610 | - 7. If d is last descriptor, set old_free_head to free_head | |
611 | - | |
612 | -When supplying used buffers to the driver: | |
613 | - | |
614 | - 1. Get corresponding used head-descriptor entry from descriptor ring, d | |
615 | - | |
616 | - 2. Get corresponding DescStatePacked entry, e | |
617 | - | |
618 | - 3. Set desc[e.last].next to free_head | |
619 | - | |
620 | - 4. Set free_head to the index of e | |
621 | - | |
622 | - 5. Steps 1,2,3,4 may be performed repeatedly if batching is possible | |
623 | - | |
624 | - 6. Increase used_idx by the size of the batch and update used_wrap_counter if needed | |
625 | - | |
626 | - 7. Update d.flags | |
627 | - | |
628 | - 8. Set the inflight field of each head DescStatePacked entry in the batch to 0 | |
629 | - | |
630 | - 9. Set old_free_head, old_used_idx, old_used_wrap_counter to free_head, used_idx, | |
631 | - used_wrap_counter | |
632 | - | |
633 | -When reconnecting: | |
634 | - | |
635 | - 1. If used_idx does not match old_used_idx (means the inflight field of DescStatePacked | |
636 | - entries in last batch may be incorrect), | |
637 | - | |
638 | - (a) Get the next descriptor ring entry through old_used_idx, d | |
639 | - | |
640 | - (b) Use old_used_wrap_counter to calculate the available flags | |
641 | - | |
642 | - (c) If d.flags is not equal to the calculated flags value (means slave has | |
643 | - submitted the buffer to guest driver before crash, so it has to commit the | |
644 | - in-progress update), set old_free_head, old_used_idx, old_used_wrap_counter | |
645 | - to free_head, used_idx, used_wrap_counter | |
646 | - | |
647 | - 2. Set free_head, used_idx, used_wrap_counter to old_free_head, old_used_idx, | |
648 | - old_used_wrap_counter (roll back any in-progress update) | |
649 | - | |
650 | - 3. Set the inflight field of each DescStatePacked entry in free list to 0 | |
651 | - | |
652 | - 4. Resubmit inflight DescStatePacked entries in order of their counter value | |
653 | - | |
654 | -Protocol features | |
655 | ------------------ | |
656 | - | |
657 | -#define VHOST_USER_PROTOCOL_F_MQ 0 | |
658 | -#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 | |
659 | -#define VHOST_USER_PROTOCOL_F_RARP 2 | |
660 | -#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 | |
661 | -#define VHOST_USER_PROTOCOL_F_MTU 4 | |
662 | -#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 | |
663 | -#define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 | |
664 | -#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 | |
665 | -#define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 | |
666 | -#define VHOST_USER_PROTOCOL_F_CONFIG 9 | |
667 | -#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 | |
668 | -#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 | |
669 | -#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 | |
670 | - | |
671 | -Master message types | |
672 | --------------------- | |
673 | - | |
674 | - * VHOST_USER_GET_FEATURES | |
675 | - | |
676 | - Id: 1 | |
677 | - Equivalent ioctl: VHOST_GET_FEATURES | |
678 | - Master payload: N/A | |
679 | - Slave payload: u64 | |
680 | - | |
681 | - Get from the underlying vhost implementation the features bitmask. | |
682 | - Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for | |
683 | - VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES. | |
684 | - | |
685 | - * VHOST_USER_SET_FEATURES | |
686 | - | |
687 | - Id: 2 | |
688 | - Ioctl: VHOST_SET_FEATURES | |
689 | - Master payload: u64 | |
690 | - | |
691 | - Enable features in the underlying vhost implementation using a bitmask. | |
692 | - Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for | |
693 | - VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES. | |
694 | - | |
695 | - * VHOST_USER_GET_PROTOCOL_FEATURES | |
696 | - | |
697 | - Id: 15 | |
698 | - Equivalent ioctl: VHOST_GET_FEATURES | |
699 | - Master payload: N/A | |
700 | - Slave payload: u64 | |
701 | - | |
702 | - Get the protocol feature bitmask from the underlying vhost implementation. | |
703 | - Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in | |
704 | - VHOST_USER_GET_FEATURES. | |
705 | - Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support | |
706 | - this message even before VHOST_USER_SET_FEATURES was called. | |
707 | - | |
708 | - * VHOST_USER_SET_PROTOCOL_FEATURES | |
709 | - | |
710 | - Id: 16 | |
711 | - Ioctl: VHOST_SET_FEATURES | |
712 | - Master payload: u64 | |
713 | - | |
714 | - Enable protocol features in the underlying vhost implementation. | |
715 | - Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in | |
716 | - VHOST_USER_GET_FEATURES. | |
717 | - Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support | |
718 | - this message even before VHOST_USER_SET_FEATURES was called. | |
719 | - | |
720 | - * VHOST_USER_SET_OWNER | |
721 | - | |
722 | - Id: 3 | |
723 | - Equivalent ioctl: VHOST_SET_OWNER | |
724 | - Master payload: N/A | |
725 | - | |
726 | - Issued when a new connection is established. It sets the current Master | |
727 | - as an owner of the session. This can be used on the Slave as a | |
728 | - "session start" flag. | |
729 | - | |
730 | - * VHOST_USER_RESET_OWNER | |
731 | - | |
732 | - Id: 4 | |
733 | - Master payload: N/A | |
734 | - | |
735 | - This is no longer used. Used to be sent to request disabling | |
736 | - all rings, but some clients interpreted it to also discard | |
737 | - connection state (this interpretation would lead to bugs). | |
738 | - It is recommended that clients either ignore this message, | |
739 | - or use it to disable all rings. | |
740 | - | |
741 | - * VHOST_USER_SET_MEM_TABLE | |
742 | - | |
743 | - Id: 5 | |
744 | - Equivalent ioctl: VHOST_SET_MEM_TABLE | |
745 | - Master payload: memory regions description | |
746 | - Slave payload: (postcopy only) memory regions description | |
747 | - | |
748 | - Sets the memory map regions on the slave so it can translate the vring | |
749 | - addresses. In the ancillary data there is an array of file descriptors | |
750 | - for each memory mapped region. The size and ordering of the fds matches | |
751 | - the number and ordering of memory regions. | |
752 | - | |
753 | - When VHOST_USER_POSTCOPY_LISTEN has been received, SET_MEM_TABLE replies with | |
754 | - the bases of the memory mapped regions to the master. The slave must | |
755 | - have mmap'd the regions but not yet accessed them and should not yet generate | |
756 | - a userfault event. Note NEED_REPLY_MASK is not set in this case. | |
757 | - QEMU will then reply back to the list of mappings with an empty | |
758 | - VHOST_USER_SET_MEM_TABLE as an acknowledgment; only upon reception of this | |
759 | - message may the guest start accessing the memory and generating faults. | |
760 | - | |
761 | - * VHOST_USER_SET_LOG_BASE | |
762 | - | |
763 | - Id: 6 | |
764 | - Equivalent ioctl: VHOST_SET_LOG_BASE | |
765 | - Master payload: u64 | |
766 | - Slave payload: N/A | |
767 | - | |
768 | - Sets logging shared memory space. | |
769 | - When slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol | |
770 | - feature, the log memory fd is provided in the ancillary data of | |
771 | - VHOST_USER_SET_LOG_BASE message, the size and offset of shared | |
772 | - memory area provided in the message. | |
773 | - | |
774 | - | |
775 | - * VHOST_USER_SET_LOG_FD | |
776 | - | |
777 | - Id: 7 | |
778 | - Equivalent ioctl: VHOST_SET_LOG_FD | |
779 | - Master payload: N/A | |
780 | - | |
781 | - Sets the logging file descriptor, which is passed as ancillary data. | |
782 | - | |
783 | - * VHOST_USER_SET_VRING_NUM | |
784 | - | |
785 | - Id: 8 | |
786 | - Equivalent ioctl: VHOST_SET_VRING_NUM | |
787 | - Master payload: vring state description | |
788 | - | |
789 | - Set the size of the queue. | |
790 | - | |
791 | - * VHOST_USER_SET_VRING_ADDR | |
792 | - | |
793 | - Id: 9 | |
794 | - Equivalent ioctl: VHOST_SET_VRING_ADDR | |
795 | - Master payload: vring address description | |
796 | - Slave payload: N/A | |
797 | - | |
798 | - Sets the addresses of the different aspects of the vring. | |
799 | - | |
800 | - * VHOST_USER_SET_VRING_BASE | |
801 | - | |
802 | - Id: 10 | |
803 | - Equivalent ioctl: VHOST_SET_VRING_BASE | |
804 | - Master payload: vring state description | |
805 | - | |
806 | - Sets the base offset in the available vring. | |
807 | - | |
808 | - * VHOST_USER_GET_VRING_BASE | |
809 | - | |
810 | - Id: 11 | |
811 | - Equivalent ioctl: VHOST_GET_VRING_BASE | |
812 | - Master payload: vring state description | |
813 | - Slave payload: vring state description | |
814 | - | |
815 | - Get the available vring base offset. | |
816 | - | |
817 | - * VHOST_USER_SET_VRING_KICK | |
818 | - | |
819 | - Id: 12 | |
820 | - Equivalent ioctl: VHOST_SET_VRING_KICK | |
821 | - Master payload: u64 | |
822 | - | |
823 | - Set the event file descriptor for adding buffers to the vring. It | |
824 | - is passed in the ancillary data. | |
825 | - Bits (0-7) of the payload contain the vring index. Bit 8 is the | |
826 | - invalid FD flag. This flag is set when there is no file descriptor | |
827 | - in the ancillary data. This signals that polling should be used | |
828 | - instead of waiting for a kick. | |
829 | - | |
830 | - * VHOST_USER_SET_VRING_CALL | |
831 | - | |
832 | - Id: 13 | |
833 | - Equivalent ioctl: VHOST_SET_VRING_CALL | |
834 | - Master payload: u64 | |
835 | - | |
836 | - Set the event file descriptor to signal when buffers are used. It | |
837 | - is passed in the ancillary data. | |
838 | - Bits (0-7) of the payload contain the vring index. Bit 8 is the | |
839 | - invalid FD flag. This flag is set when there is no file descriptor | |
840 | - in the ancillary data. This signals that polling will be used | |
841 | - instead of waiting for the call. | |
842 | - | |
843 | - * VHOST_USER_SET_VRING_ERR | |
844 | - | |
845 | - Id: 14 | |
846 | - Equivalent ioctl: VHOST_SET_VRING_ERR | |
847 | - Master payload: u64 | |
848 | - | |
849 | - Set the event file descriptor to signal when error occurs. It | |
850 | - is passed in the ancillary data. | |
851 | - Bits (0-7) of the payload contain the vring index. Bit 8 is the | |
852 | - invalid FD flag. This flag is set when there is no file descriptor | |
853 | - in the ancillary data. | |
854 | - | |
855 | - * VHOST_USER_GET_QUEUE_NUM | |
856 | - | |
857 | - Id: 17 | |
858 | - Equivalent ioctl: N/A | |
859 | - Master payload: N/A | |
860 | - Slave payload: u64 | |
861 | - | |
862 | - Query how many queues the backend supports. This request should be | |
863 | - sent only when VHOST_USER_PROTOCOL_F_MQ is set in queried protocol | |
864 | - features by VHOST_USER_GET_PROTOCOL_FEATURES. | |
865 | - | |
866 | - * VHOST_USER_SET_VRING_ENABLE | |
867 | - | |
868 | - Id: 18 | |
869 | - Equivalent ioctl: N/A | |
870 | - Master payload: vring state description | |
871 | - | |
872 | - Signal slave to enable or disable corresponding vring. | |
873 | - This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES | |
874 | - has been negotiated. | |
875 | - | |
876 | - * VHOST_USER_SEND_RARP | |
877 | - | |
878 | - Id: 19 | |
879 | - Equivalent ioctl: N/A | |
880 | - Master payload: u64 | |
881 | - | |
882 | - Ask vhost user backend to broadcast a fake RARP to notify the migration | |
883 | - is terminated for guest that does not support GUEST_ANNOUNCE. | |
884 | - Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in | |
885 | - VHOST_USER_GET_FEATURES and protocol feature bit VHOST_USER_PROTOCOL_F_RARP | |
886 | - is present in VHOST_USER_GET_PROTOCOL_FEATURES. | |
887 | - The first 6 bytes of the payload contain the mac address of the guest to | |
888 | - allow the vhost user backend to construct and broadcast the fake RARP. | |
889 | - | |
890 | - * VHOST_USER_NET_SET_MTU | |
891 | - | |
892 | - Id: 20 | |
893 | - Equivalent ioctl: N/A | |
894 | - Master payload: u64 | |
895 | - | |
896 | - Set host MTU value exposed to the guest. | |
897 | - This request should be sent only when VIRTIO_NET_F_MTU feature has been | |
898 | - successfully negotiated, VHOST_USER_F_PROTOCOL_FEATURES is present in | |
899 | - VHOST_USER_GET_FEATURES and protocol feature bit | |
900 | - VHOST_USER_PROTOCOL_F_NET_MTU is present in | |
901 | - VHOST_USER_GET_PROTOCOL_FEATURES. | |
902 | - If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, slave must respond | |
903 | - with zero in case the specified MTU is valid, or non-zero otherwise. | |
904 | - | |
905 | - * VHOST_USER_SET_SLAVE_REQ_FD | |
906 | - | |
907 | - Id: 21 | |
908 | - Equivalent ioctl: N/A | |
909 | - Master payload: N/A | |
910 | - | |
911 | - Set the socket file descriptor for slave initiated requests. It is passed | |
912 | - in the ancillary data. | |
913 | - This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES | |
914 | - has been negotiated, and protocol feature bit VHOST_USER_PROTOCOL_F_SLAVE_REQ | |
915 | - bit is present in VHOST_USER_GET_PROTOCOL_FEATURES. | |
916 | - If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, slave must respond | |
917 | - with zero for success, non-zero otherwise. | |
918 | - | |
919 | - * VHOST_USER_IOTLB_MSG | |
920 | - | |
921 | - Id: 22 | |
922 | - Equivalent ioctl: N/A (equivalent to VHOST_IOTLB_MSG message type) | |
923 | - Master payload: struct vhost_iotlb_msg | |
924 | - Slave payload: u64 | |
925 | - | |
926 | - Send IOTLB messages with struct vhost_iotlb_msg as payload. | |
927 | - Master sends such requests to update and invalidate entries in the device | |
928 | - IOTLB. The slave has to acknowledge the request with sending zero as u64 | |
929 | - payload for success, non-zero otherwise. | |
930 | - This request should be sent only when VIRTIO_F_IOMMU_PLATFORM feature | |
931 | - has been successfully negotiated. | |
932 | - | |
933 | - * VHOST_USER_SET_VRING_ENDIAN | |
934 | - | |
935 | - Id: 23 | |
936 | - Equivalent ioctl: VHOST_SET_VRING_ENDIAN | |
937 | - Master payload: vring state description | |
938 | - | |
939 | - Set the endianness of a VQ for legacy devices. Little-endian is indicated | |
940 | - with state.num set to 0 and big-endian is indicated with state.num set | |
941 | - to 1. Other values are invalid. | |
942 | - This request should be sent only when VHOST_USER_PROTOCOL_F_CROSS_ENDIAN | |
943 | - has been negotiated. | |
944 | - Backends that negotiated this feature should handle both endiannesses | |
945 | - and expect this message once (per VQ) during device configuration | |
946 | - (ie. before the master starts the VQ). | |
947 | - | |
948 | - * VHOST_USER_GET_CONFIG | |
949 | - | |
950 | - Id: 24 | |
951 | - Equivalent ioctl: N/A | |
952 | - Master payload: virtio device config space | |
953 | - Slave payload: virtio device config space | |
954 | - | |
955 | - When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, this message is | |
956 | - submitted by the vhost-user master to fetch the contents of the virtio | |
957 | - device configuration space, vhost-user slave's payload size MUST match | |
958 | - master's request, vhost-user slave uses zero length of payload to | |
959 | - indicate an error to vhost-user master. The vhost-user master may | |
960 | - cache the contents to avoid repeated VHOST_USER_GET_CONFIG calls. | |
961 | - | |
962 | -* VHOST_USER_SET_CONFIG | |
963 | - | |
964 | - Id: 25 | |
965 | - Equivalent ioctl: N/A | |
966 | - Master payload: virtio device config space | |
967 | - Slave payload: N/A | |
968 | - | |
969 | - When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, this message is | |
970 | - submitted by the vhost-user master when the Guest changes the virtio | |
971 | - device configuration space and also can be used for live migration | |
972 | - on the destination host. The vhost-user slave must check the flags | |
973 | - field, and slaves MUST NOT accept SET_CONFIG for read-only | |
974 | - configuration space fields unless the live migration bit is set. | |
975 | - | |
976 | -* VHOST_USER_CREATE_CRYPTO_SESSION | |
977 | - | |
978 | - Id: 26 | |
979 | - Equivalent ioctl: N/A | |
980 | - Master payload: crypto session description | |
981 | - Slave payload: crypto session description | |
982 | - | |
983 | - Create a session for crypto operation. The server side must return the | |
984 | - session id, 0 or positive for success, negative for failure. | |
985 | - This request should be sent only when VHOST_USER_PROTOCOL_F_CRYPTO_SESSION | |
986 | - feature has been successfully negotiated. | |
987 | - It's a required feature for crypto devices. | |
988 | - | |
989 | -* VHOST_USER_CLOSE_CRYPTO_SESSION | |
990 | - | |
991 | - Id: 27 | |
992 | - Equivalent ioctl: N/A | |
993 | - Master payload: u64 | |
994 | - | |
995 | - Close a session for crypto operation which was previously | |
996 | - created by VHOST_USER_CREATE_CRYPTO_SESSION. | |
997 | - This request should be sent only when VHOST_USER_PROTOCOL_F_CRYPTO_SESSION | |
998 | - feature has been successfully negotiated. | |
999 | - It's a required feature for crypto devices. | |
1000 | - | |
1001 | - * VHOST_USER_POSTCOPY_ADVISE | |
1002 | - Id: 28 | |
1003 | - Master payload: N/A | |
1004 | - Slave payload: userfault fd | |
1005 | - | |
1006 | - When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, the | |
1007 | - master advises slave that a migration with postcopy enabled is underway, | |
1008 | - the slave must open a userfaultfd for later use. | |
1009 | - Note that at this stage the migration is still in precopy mode. | |
1010 | - | |
1011 | - * VHOST_USER_POSTCOPY_LISTEN | |
1012 | - Id: 29 | |
1013 | - Master payload: N/A | |
1014 | - | |
1015 | - Master advises slave that a transition to postcopy mode has happened. | |
1016 | - The slave must ensure that shared memory is registered with userfaultfd | |
1017 | - to cause faulting of non-present pages. | |
1018 | - | |
1019 | - This is always sent sometime after a VHOST_USER_POSTCOPY_ADVISE, and | |
1020 | - thus only when VHOST_USER_PROTOCOL_F_PAGEFAULT is supported. | |
1021 | - | |
1022 | - * VHOST_USER_POSTCOPY_END | |
1023 | - Id: 30 | |
1024 | - Slave payload: u64 | |
1025 | - | |
1026 | - Master advises that postcopy migration has now completed. The | |
1027 | - slave must disable the userfaultfd. The response is an acknowledgement | |
1028 | - only. | |
1029 | - When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, this message | |
1030 | - is sent at the end of the migration, after VHOST_USER_POSTCOPY_LISTEN | |
1031 | - was previously sent. | |
1032 | - The value returned is an error indication; 0 is success. | |
1033 | - | |
1034 | - * VHOST_USER_GET_INFLIGHT_FD | |
1035 | - Id: 31 | |
1036 | - Equivalent ioctl: N/A | |
1037 | - Master payload: inflight description | |
1038 | - | |
1039 | - When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been | |
1040 | - successfully negotiated, this message is submitted by master to get | |
1041 | - a shared buffer from slave. The shared buffer will be used to track | |
1042 | - inflight I/O by slave. QEMU should retrieve a new one when the VM is reset. | |
1043 | - | |
1044 | - * VHOST_USER_SET_INFLIGHT_FD | |
1045 | - Id: 32 | |
1046 | - Equivalent ioctl: N/A | |
1047 | - Master payload: inflight description | |
1048 | - | |
1049 | - When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been | |
1050 | - successfully negotiated, this message is submitted by master to send | |
1051 | - the shared inflight buffer back to slave so that slave could get | |
1052 | - inflight I/O after a crash or restart. | |
1053 | - | |
1054 | -Slave message types | |
1055 | -------------------- | |
1056 | - | |
1057 | - * VHOST_USER_SLAVE_IOTLB_MSG | |
1058 | - | |
1059 | - Id: 1 | |
1060 | - Equivalent ioctl: N/A (equivalent to VHOST_IOTLB_MSG message type) | |
1061 | - Slave payload: struct vhost_iotlb_msg | |
1062 | - Master payload: N/A | |
1063 | - | |
1064 | - Send IOTLB messages with struct vhost_iotlb_msg as payload. | |
1065 | - Slave sends such requests to notify of an IOTLB miss, or an IOTLB | |
1066 | - access failure. If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, | |
1067 | - and slave set the VHOST_USER_NEED_REPLY flag, master must respond with | |
1068 | - zero when operation is successfully completed, or non-zero otherwise. | |
1069 | - This request should be sent only when VIRTIO_F_IOMMU_PLATFORM feature | |
1070 | - has been successfully negotiated. | |
1071 | - | |
1072 | -* VHOST_USER_SLAVE_CONFIG_CHANGE_MSG | |
1073 | - | |
1074 | - Id: 2 | |
1075 | - Equivalent ioctl: N/A | |
1076 | - Slave payload: N/A | |
1077 | - Master payload: N/A | |
1078 | - | |
1079 | - When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, vhost-user slave sends | |
1080 | - such messages to notify that the virtio device's configuration space has | |
1081 | - changed, for those host devices which can support such feature, host | |
1082 | - driver can send VHOST_USER_GET_CONFIG message to slave to get the latest | |
1083 | - content. If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, and slave set | |
1084 | - the VHOST_USER_NEED_REPLY flag, master must respond with zero when | |
1085 | - operation is successfully completed, or non-zero otherwise. | |
1086 | - | |
1087 | - * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG | |
1088 | - | |
1089 | - Id: 3 | |
1090 | - Equivalent ioctl: N/A | |
1091 | - Slave payload: vring area description | |
1092 | - Master payload: N/A | |
1093 | - | |
1094 | - Sets host notifier for a specified queue. The queue index is contained | |
1095 | - in the u64 field of the vring area description. The host notifier is | |
1096 | - described by the file descriptor (typically it's a VFIO device fd) which | |
1097 | - is passed as ancillary data and the size (which is mmap size and should | |
1098 | - be the same as host page size) and offset (which is mmap offset) carried | |
1099 | - in the vring area description. QEMU can mmap the file descriptor based | |
1100 | - on the size and offset to get a memory range. Registering a host notifier | |
1101 | - means mapping this memory range to the VM as the specified queue's notify | |
1102 | - MMIO region. Slave sends this request to tell QEMU to de-register the | |
1103 | - existing notifier if any and register the new notifier if the request is | |
1104 | - sent with a file descriptor. | |
1105 | - This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | |
1106 | - protocol feature has been successfully negotiated. | |
1107 | - | |
1108 | -VHOST_USER_PROTOCOL_F_REPLY_ACK: | |
1109 | -------------------------------- | |
1110 | -The original vhost-user specification only demands replies for certain | |
1111 | -commands. This differs from the vhost protocol implementation where commands | |
1112 | -are sent over an ioctl() call and block until the client has completed. | |
1113 | - | |
1114 | -With this protocol extension negotiated, the sender (QEMU) can set the | |
1115 | -"need_reply" [Bit 3] flag to any command. This indicates that | |
1116 | -the client MUST respond with a Payload VhostUserMsg indicating success or | |
1117 | -failure. The payload should be set to zero on success or non-zero on failure, | |
1118 | -unless the message already has an explicit reply body. | |
1119 | - | |
1120 | -The response payload gives QEMU a deterministic indication of the result | |
1121 | -of the command. Today, QEMU is expected to terminate the main vhost-user | |
1122 | -loop upon receiving such errors. In future, qemu could be taught to be more | |
1123 | -resilient for selective requests. | |
1124 | - | |
1125 | -For the message types that already solicit a reply from the client, the | |
1126 | -presence of VHOST_USER_PROTOCOL_F_REPLY_ACK or need_reply bit being set brings | |
1127 | -no behavioural change. (See the 'Communication' section for details.) | |
1128 | - | |
1129 | -Backend program conventions | |
1130 | ---------------------------- | |
1131 | - | |
1132 | -vhost-user backends can provide various devices & services and may | |
1133 | -need to be configured manually depending on the use case. However, it | |
1134 | -is a good idea to follow the conventions listed here when | |
1135 | -possible. Users, QEMU or libvirt, can then rely on some common | |
1136 | -behaviour to avoid heterogeneous configuration and management of the | |
1137 | -backend programs and facilitate interoperability. | |
1138 | - | |
1139 | -Each backend installed on a host system should come with at least one | |
1140 | -JSON file that conforms to the vhost-user.json schema. Each file | |
1141 | -informs the management applications about the backend type, and binary | |
1142 | -location. In addition, it defines rules for management apps for | |
1143 | -picking the highest priority backend when multiple match the search | |
1144 | -criteria (see @VhostUserBackend documentation in the schema file). | |
1145 | - | |
1146 | -If the backend is not capable of enabling a requested feature on the | |
1147 | -host (such as 3D acceleration with virgl), or the initialization | |
1148 | -failed, the backend should fail to start early and exit with a status | |
1149 | -!= 0. It may also print a message to stderr for further details. | |
1150 | - | |
1151 | -The backend program must not daemonize itself, but it may be | |
1152 | -daemonized by the management layer. It may also have a restricted | |
1153 | -access to the system. | |
1154 | - | |
1155 | -File descriptors 0, 1 and 2 will exist, and have regular | |
1156 | -stdin/stdout/stderr usage (they may have been redirected to /dev/null | |
1157 | -by the management layer, or to a log handler). | |
1158 | - | |
1159 | -The backend program must end (as quickly and cleanly as possible) when | |
1160 | -the SIGTERM signal is received. Eventually, it may receive SIGKILL by | |
1161 | -the management layer after a few seconds. | |
1162 | - | |
1163 | -The following command line options have an expected behaviour. They | |
1164 | -are mandatory, unless explicitly said differently: | |
1165 | - | |
1166 | -* --socket-path=PATH | |
1167 | - | |
1168 | -This option specifies the location of the vhost-user Unix domain socket. | |
1169 | -It is incompatible with --fd. | |
1170 | - | |
1171 | -* --fd=FDNUM | |
1172 | - | |
1173 | -When this argument is given, the backend program is started with the | |
1174 | -vhost-user socket as file descriptor FDNUM. It is incompatible with | |
1175 | ---socket-path. | |
1176 | - | |
1177 | -* --print-capabilities | |
1178 | - | |
1179 | -Output to stdout the backend capabilities in JSON format, and then | |
1180 | -exit successfully. Other options and arguments should be ignored, and | |
1181 | -the backend program should not perform its normal function. The | |
1182 | -capabilities can be reported dynamically depending on the host | |
1183 | -capabilities. | |
1184 | - | |
1185 | -The JSON output is described in the vhost-user.json schema, by | |
1186 | -@VHostUserBackendCapabilities. Example: | |
1187 | -{ | |
1188 | - "type": "foo", | |
1189 | - "features": [ | |
1190 | - "feature-a", | |
1191 | - "feature-b" | |
1192 | - ] | |
1193 | -} | |
1194 | - | |
1195 | -vhost-user-input | |
1196 | ----------------- | |
1197 | - | |
1198 | -Command line options: | |
1199 | - | |
1200 | -* --evdev-path=PATH (optional) | |
1201 | - | |
1202 | -Specify the linux input device. | |
1203 | - | |
1204 | -* --no-grab (optional) | |
1205 | - | |
1206 | -Do not request exclusive access to the input device. | |
1207 | - | |
1208 | -vhost-user-gpu | |
1209 | --------------- | |
1210 | - | |
1211 | -Command line options: | |
1212 | - | |
1213 | -* --render-node=PATH (optional) | |
1214 | - | |
1215 | -Specify the GPU DRM render node. | |
1216 | - | |
1217 | -* --virgl (optional) | |
1218 | - | |
1219 | -Enable virgl rendering support. |
@@ -37,14 +37,7 @@ | ||
37 | 37 | #include "hw/pci/pci_bus.h" |
38 | 38 | #include "qapi/error.h" |
39 | 39 | #include "qom/qom-qobject.h" |
40 | - | |
41 | -//#define DEBUG | |
42 | - | |
43 | -#ifdef DEBUG | |
44 | -# define ACPI_PCIHP_DPRINTF(format, ...) printf(format, ## __VA_ARGS__) | |
45 | -#else | |
46 | -# define ACPI_PCIHP_DPRINTF(format, ...) do { } while (0) | |
47 | -#endif | |
40 | +#include "trace.h" | |
48 | 41 | |
49 | 42 | #define ACPI_PCIHP_ADDR 0xae00 |
50 | 43 | #define ACPI_PCIHP_SIZE 0x0014 |
@@ -159,6 +152,8 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slo | ||
159 | 152 | int slot = ctz32(slots); |
160 | 153 | PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel); |
161 | 154 | |
155 | + trace_acpi_pci_eject_slot(bsel, slot); | |
156 | + | |
162 | 157 | if (!bus) { |
163 | 158 | return; |
164 | 159 | } |
@@ -270,6 +265,8 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, | ||
270 | 265 | void acpi_pcihp_device_unplug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, |
271 | 266 | DeviceState *dev, Error **errp) |
272 | 267 | { |
268 | + trace_acpi_pci_unplug(PCI_SLOT(PCI_DEVICE(dev)->devfn), | |
269 | + acpi_pcihp_get_bsel(pci_get_bus(PCI_DEVICE(dev)))); | |
273 | 270 | object_property_set_bool(OBJECT(dev), false, "realized", NULL); |
274 | 271 | } |
275 | 272 |
@@ -280,6 +277,9 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev, | ||
280 | 277 | PCIDevice *pdev = PCI_DEVICE(dev); |
281 | 278 | int slot = PCI_SLOT(pdev->devfn); |
282 | 279 | int bsel = acpi_pcihp_get_bsel(pci_get_bus(pdev)); |
280 | + | |
281 | + trace_acpi_pci_unplug_request(bsel, slot); | |
282 | + | |
283 | 283 | if (bsel < 0) { |
284 | 284 | error_setg(errp, "Unsupported bus. Bus doesn't have property '" |
285 | 285 | ACPI_PCIHP_PROP_BSEL "' set"); |
@@ -306,23 +306,23 @@ static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size) | ||
306 | 306 | if (!s->legacy_piix) { |
307 | 307 | s->acpi_pcihp_pci_status[bsel].up = 0; |
308 | 308 | } |
309 | - ACPI_PCIHP_DPRINTF("pci_up_read %" PRIu32 "\n", val); | |
309 | + trace_acpi_pci_up_read(val); | |
310 | 310 | break; |
311 | 311 | case PCI_DOWN_BASE: |
312 | 312 | val = s->acpi_pcihp_pci_status[bsel].down; |
313 | - ACPI_PCIHP_DPRINTF("pci_down_read %" PRIu32 "\n", val); | |
313 | + trace_acpi_pci_down_read(val); | |
314 | 314 | break; |
315 | 315 | case PCI_EJ_BASE: |
316 | 316 | /* No feature defined yet */ |
317 | - ACPI_PCIHP_DPRINTF("pci_features_read %" PRIu32 "\n", val); | |
317 | + trace_acpi_pci_features_read(val); | |
318 | 318 | break; |
319 | 319 | case PCI_RMV_BASE: |
320 | 320 | val = s->acpi_pcihp_pci_status[bsel].hotplug_enable; |
321 | - ACPI_PCIHP_DPRINTF("pci_rmv_read %" PRIu32 "\n", val); | |
321 | + trace_acpi_pci_rmv_read(val); | |
322 | 322 | break; |
323 | 323 | case PCI_SEL_BASE: |
324 | 324 | val = s->hotplug_select; |
325 | - ACPI_PCIHP_DPRINTF("pci_sel_read %" PRIu32 "\n", val); | |
325 | + trace_acpi_pci_sel_read(val); | |
326 | 326 | default: |
327 | 327 | break; |
328 | 328 | } |
@@ -340,13 +340,11 @@ static void pci_write(void *opaque, hwaddr addr, uint64_t data, | ||
340 | 340 | break; |
341 | 341 | } |
342 | 342 | acpi_pcihp_eject_slot(s, s->hotplug_select, data); |
343 | - ACPI_PCIHP_DPRINTF("pciej write %" HWADDR_PRIx " <== %" PRIu64 "\n", | |
344 | - addr, data); | |
343 | + trace_acpi_pci_ej_write(addr, data); | |
345 | 344 | break; |
346 | 345 | case PCI_SEL_BASE: |
347 | 346 | s->hotplug_select = s->legacy_piix ? ACPI_PCIHP_BSEL_DEFAULT : data; |
348 | - ACPI_PCIHP_DPRINTF("pcisel write %" HWADDR_PRIx " <== %" PRIu64 "\n", | |
349 | - addr, data); | |
347 | + trace_acpi_pci_sel_write(addr, data); | |
350 | 348 | default: |
351 | 349 | break; |
352 | 350 | } |
@@ -39,14 +39,7 @@ | ||
39 | 39 | #include "hw/acpi/acpi_dev_interface.h" |
40 | 40 | #include "hw/xen/xen.h" |
41 | 41 | #include "qom/cpu.h" |
42 | - | |
43 | -//#define DEBUG | |
44 | - | |
45 | -#ifdef DEBUG | |
46 | -# define PIIX4_DPRINTF(format, ...) printf(format, ## __VA_ARGS__) | |
47 | -#else | |
48 | -# define PIIX4_DPRINTF(format, ...) do { } while (0) | |
49 | -#endif | |
42 | +#include "trace.h" | |
50 | 43 | |
51 | 44 | #define GPE_BASE 0xafe0 |
52 | 45 | #define GPE_LEN 4 |
@@ -583,7 +576,7 @@ static uint64_t gpe_readb(void *opaque, hwaddr addr, unsigned width) | ||
583 | 576 | PIIX4PMState *s = opaque; |
584 | 577 | uint32_t val = acpi_gpe_ioport_readb(&s->ar, addr); |
585 | 578 | |
586 | - PIIX4_DPRINTF("gpe read %" HWADDR_PRIx " == %" PRIu32 "\n", addr, val); | |
579 | + trace_piix4_gpe_readb(addr, width, val); | |
587 | 580 | return val; |
588 | 581 | } |
589 | 582 |
@@ -592,10 +585,9 @@ static void gpe_writeb(void *opaque, hwaddr addr, uint64_t val, | ||
592 | 585 | { |
593 | 586 | PIIX4PMState *s = opaque; |
594 | 587 | |
588 | + trace_piix4_gpe_writeb(addr, width, val); | |
595 | 589 | acpi_gpe_ioport_writeb(&s->ar, addr, val); |
596 | 590 | acpi_update_sci(&s->ar, s->irq); |
597 | - | |
598 | - PIIX4_DPRINTF("gpe write %" HWADDR_PRIx " <== %" PRIu64 "\n", addr, val); | |
599 | 591 | } |
600 | 592 | |
601 | 593 | static const MemoryRegionOps piix4_gpe_ops = { |
@@ -31,6 +31,22 @@ cpuhp_acpi_ejecting_cpu(uint32_t idx) "0x%"PRIx32 | ||
31 | 31 | cpuhp_acpi_write_ost_ev(uint32_t slot, uint32_t ev) "idx[0x%"PRIx32"] OST EVENT: 0x%"PRIx32 |
32 | 32 | cpuhp_acpi_write_ost_status(uint32_t slot, uint32_t st) "idx[0x%"PRIx32"] OST STATUS: 0x%"PRIx32 |
33 | 33 | |
34 | +# pcihp.c | |
35 | +acpi_pci_eject_slot(unsigned bsel, unsigned slot) "bsel: %u slot: %u" | |
36 | +acpi_pci_unplug(int bsel, int slot) "bsel: %d slot: %d" | |
37 | +acpi_pci_unplug_request(int bsel, int slot) "bsel: %d slot: %d" | |
38 | +acpi_pci_up_read(uint32_t val) "%" PRIu32 | |
39 | +acpi_pci_down_read(uint32_t val) "%" PRIu32 | |
40 | +acpi_pci_features_read(uint32_t val) "%" PRIu32 | |
41 | +acpi_pci_rmv_read(uint32_t val) "%" PRIu32 | |
42 | +acpi_pci_sel_read(uint32_t val) "%" PRIu32 | |
43 | +acpi_pci_ej_write(uint64_t addr, uint64_t data) "0x%" PRIx64 " <== %" PRIu64 | |
44 | +acpi_pci_sel_write(uint64_t addr, uint64_t data) "0x%" PRIx64 " <== %" PRIu64 | |
45 | + | |
46 | +# piix4.c | |
47 | +piix4_gpe_readb(uint64_t addr, unsigned width, uint64_t val) "addr: 0x%" PRIx64 " width: %d ==> 0x%" PRIx64 | |
48 | +piix4_gpe_writeb(uint64_t addr, unsigned width, uint64_t val) "addr: 0x%" PRIx64 " width: %d <== 0x%" PRIx64 | |
49 | + | |
34 | 50 | # tco.c |
35 | 51 | tco_timer_reload(int ticks, int msec) "ticks=%d (%d ms)" |
36 | 52 | tco_timer_expired(int timeouts_no, bool strap, bool no_reboot) "timeouts_no=%d no_reboot=%d/%d" |
@@ -40,6 +40,7 @@ | ||
40 | 40 | #include "hw/loader.h" |
41 | 41 | #include "hw/hw.h" |
42 | 42 | #include "hw/acpi/aml-build.h" |
43 | +#include "hw/acpi/pci.h" | |
43 | 44 | #include "hw/pci/pcie_host.h" |
44 | 45 | #include "hw/pci/pci.h" |
45 | 46 | #include "hw/arm/virt.h" |
@@ -546,25 +547,20 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) | ||
546 | 547 | } |
547 | 548 | |
548 | 549 | static void |
549 | -build_mcfg(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) | |
550 | +build_mcfg(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info) | |
550 | 551 | { |
551 | 552 | AcpiTableMcfg *mcfg; |
552 | - const MemMapEntry *memmap = vms->memmap; | |
553 | - int ecam_id = VIRT_ECAM_ID(vms->highmem_ecam); | |
554 | 553 | int len = sizeof(*mcfg) + sizeof(mcfg->allocation[0]); |
555 | - int mcfg_start = table_data->len; | |
556 | 554 | |
557 | 555 | mcfg = acpi_data_push(table_data, len); |
558 | - mcfg->allocation[0].address = cpu_to_le64(memmap[ecam_id].base); | |
556 | + mcfg->allocation[0].address = cpu_to_le64(info->base); | |
559 | 557 | |
560 | 558 | /* Only a single allocation so no need to play with segments */ |
561 | 559 | mcfg->allocation[0].pci_segment = cpu_to_le16(0); |
562 | 560 | mcfg->allocation[0].start_bus_number = 0; |
563 | - mcfg->allocation[0].end_bus_number = | |
564 | - PCIE_MMCFG_BUS(memmap[ecam_id].size - 1); | |
561 | + mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->size - 1); | |
565 | 562 | |
566 | - build_header(linker, table_data, (void *)(table_data->data + mcfg_start), | |
567 | - "MCFG", table_data->len - mcfg_start, 1, NULL, NULL); | |
563 | + build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1, NULL, NULL); | |
568 | 564 | } |
569 | 565 | |
570 | 566 | /* GTDT */ |
@@ -803,7 +799,13 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) | ||
803 | 799 | build_gtdt(tables_blob, tables->linker, vms); |
804 | 800 | |
805 | 801 | acpi_add_table(table_offsets, tables_blob); |
806 | - build_mcfg(tables_blob, tables->linker, vms); | |
802 | + { | |
803 | + AcpiMcfgInfo mcfg = { | |
804 | + .base = vms->memmap[VIRT_ECAM_ID(vms->highmem_ecam)].base, | |
805 | + .size = vms->memmap[VIRT_ECAM_ID(vms->highmem_ecam)].size, | |
806 | + }; | |
807 | + build_mcfg(tables_blob, tables->linker, &mcfg); | |
808 | + } | |
807 | 809 | |
808 | 810 | acpi_add_table(table_offsets, tables_blob); |
809 | 811 | build_spcr(tables_blob, tables->linker, vms); |
@@ -103,7 +103,7 @@ const VhostDevConfigOps blk_ops = { | ||
103 | 103 | .vhost_dev_config_notifier = vhost_user_blk_handle_config_change, |
104 | 104 | }; |
105 | 105 | |
106 | -static void vhost_user_blk_start(VirtIODevice *vdev) | |
106 | +static int vhost_user_blk_start(VirtIODevice *vdev) | |
107 | 107 | { |
108 | 108 | VHostUserBlk *s = VHOST_USER_BLK(vdev); |
109 | 109 | BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); |
@@ -112,13 +112,13 @@ static void vhost_user_blk_start(VirtIODevice *vdev) | ||
112 | 112 | |
113 | 113 | if (!k->set_guest_notifiers) { |
114 | 114 | error_report("binding does not support guest notifiers"); |
115 | - return; | |
115 | + return -ENOSYS; | |
116 | 116 | } |
117 | 117 | |
118 | 118 | ret = vhost_dev_enable_notifiers(&s->dev, vdev); |
119 | 119 | if (ret < 0) { |
120 | 120 | error_report("Error enabling host notifiers: %d", -ret); |
121 | - return; | |
121 | + return ret; | |
122 | 122 | } |
123 | 123 | |
124 | 124 | ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true); |
@@ -157,12 +157,13 @@ static void vhost_user_blk_start(VirtIODevice *vdev) | ||
157 | 157 | vhost_virtqueue_mask(&s->dev, vdev, i, false); |
158 | 158 | } |
159 | 159 | |
160 | - return; | |
160 | + return ret; | |
161 | 161 | |
162 | 162 | err_guest_notifiers: |
163 | 163 | k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); |
164 | 164 | err_host_notifiers: |
165 | 165 | vhost_dev_disable_notifiers(&s->dev, vdev); |
166 | + return ret; | |
166 | 167 | } |
167 | 168 | |
168 | 169 | static void vhost_user_blk_stop(VirtIODevice *vdev) |
@@ -190,18 +191,28 @@ static void vhost_user_blk_stop(VirtIODevice *vdev) | ||
190 | 191 | static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) |
191 | 192 | { |
192 | 193 | VHostUserBlk *s = VHOST_USER_BLK(vdev); |
193 | - bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK; | |
194 | + bool should_start = vdev->started; | |
195 | + int ret; | |
194 | 196 | |
195 | 197 | if (!vdev->vm_running) { |
196 | 198 | should_start = false; |
197 | 199 | } |
198 | 200 | |
201 | + if (!s->connected) { | |
202 | + return; | |
203 | + } | |
204 | + | |
199 | 205 | if (s->dev.started == should_start) { |
200 | 206 | return; |
201 | 207 | } |
202 | 208 | |
203 | 209 | if (should_start) { |
204 | - vhost_user_blk_start(vdev); | |
210 | + ret = vhost_user_blk_start(vdev); | |
211 | + if (ret < 0) { | |
212 | + error_report("vhost-user-blk: vhost start failed: %s", | |
213 | + strerror(-ret)); | |
214 | + qemu_chr_fe_disconnect(&s->chardev); | |
215 | + } | |
205 | 216 | } else { |
206 | 217 | vhost_user_blk_stop(vdev); |
207 | 218 | } |
@@ -237,10 +248,13 @@ static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev, | ||
237 | 248 | static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) |
238 | 249 | { |
239 | 250 | VHostUserBlk *s = VHOST_USER_BLK(vdev); |
240 | - int i; | |
251 | + int i, ret; | |
241 | 252 | |
242 | - if (!(virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) && | |
243 | - !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1))) { | |
253 | + if (!vdev->start_on_kick) { | |
254 | + return; | |
255 | + } | |
256 | + | |
257 | + if (!s->connected) { | |
244 | 258 | return; |
245 | 259 | } |
246 | 260 |
@@ -251,7 +265,13 @@ static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) | ||
251 | 265 | /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start |
252 | 266 | * vhost here instead of waiting for .set_status(). |
253 | 267 | */ |
254 | - vhost_user_blk_start(vdev); | |
268 | + ret = vhost_user_blk_start(vdev); | |
269 | + if (ret < 0) { | |
270 | + error_report("vhost-user-blk: vhost start failed: %s", | |
271 | + strerror(-ret)); | |
272 | + qemu_chr_fe_disconnect(&s->chardev); | |
273 | + return; | |
274 | + } | |
255 | 275 | |
256 | 276 | /* Kick right away to begin processing requests already in vring */ |
257 | 277 | for (i = 0; i < s->dev.nvqs; i++) { |
@@ -271,11 +291,103 @@ static void vhost_user_blk_reset(VirtIODevice *vdev) | ||
271 | 291 | vhost_dev_free_inflight(s->inflight); |
272 | 292 | } |
273 | 293 | |
294 | +static int vhost_user_blk_connect(DeviceState *dev) | |
295 | +{ | |
296 | + VirtIODevice *vdev = VIRTIO_DEVICE(dev); | |
297 | + VHostUserBlk *s = VHOST_USER_BLK(vdev); | |
298 | + int ret = 0; | |
299 | + | |
300 | + if (s->connected) { | |
301 | + return 0; | |
302 | + } | |
303 | + s->connected = true; | |
304 | + | |
305 | + s->dev.nvqs = s->num_queues; | |
306 | + s->dev.vqs = s->vqs; | |
307 | + s->dev.vq_index = 0; | |
308 | + s->dev.backend_features = 0; | |
309 | + | |
310 | + vhost_dev_set_config_notifier(&s->dev, &blk_ops); | |
311 | + | |
312 | + ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0); | |
313 | + if (ret < 0) { | |
314 | + error_report("vhost-user-blk: vhost initialization failed: %s", | |
315 | + strerror(-ret)); | |
316 | + return ret; | |
317 | + } | |
318 | + | |
319 | + /* restore vhost state */ | |
320 | + if (vdev->started) { | |
321 | + ret = vhost_user_blk_start(vdev); | |
322 | + if (ret < 0) { | |
323 | + error_report("vhost-user-blk: vhost start failed: %s", | |
324 | + strerror(-ret)); | |
325 | + return ret; | |
326 | + } | |
327 | + } | |
328 | + | |
329 | + return 0; | |
330 | +} | |
331 | + | |
332 | +static void vhost_user_blk_disconnect(DeviceState *dev) | |
333 | +{ | |
334 | + VirtIODevice *vdev = VIRTIO_DEVICE(dev); | |
335 | + VHostUserBlk *s = VHOST_USER_BLK(vdev); | |
336 | + | |
337 | + if (!s->connected) { | |
338 | + return; | |
339 | + } | |
340 | + s->connected = false; | |
341 | + | |
342 | + if (s->dev.started) { | |
343 | + vhost_user_blk_stop(vdev); | |
344 | + } | |
345 | + | |
346 | + vhost_dev_cleanup(&s->dev); | |
347 | +} | |
348 | + | |
349 | +static gboolean vhost_user_blk_watch(GIOChannel *chan, GIOCondition cond, | |
350 | + void *opaque) | |
351 | +{ | |
352 | + DeviceState *dev = opaque; | |
353 | + VirtIODevice *vdev = VIRTIO_DEVICE(dev); | |
354 | + VHostUserBlk *s = VHOST_USER_BLK(vdev); | |
355 | + | |
356 | + qemu_chr_fe_disconnect(&s->chardev); | |
357 | + | |
358 | + return true; | |
359 | +} | |
360 | + | |
361 | +static void vhost_user_blk_event(void *opaque, int event) | |
362 | +{ | |
363 | + DeviceState *dev = opaque; | |
364 | + VirtIODevice *vdev = VIRTIO_DEVICE(dev); | |
365 | + VHostUserBlk *s = VHOST_USER_BLK(vdev); | |
366 | + | |
367 | + switch (event) { | |
368 | + case CHR_EVENT_OPENED: | |
369 | + if (vhost_user_blk_connect(dev) < 0) { | |
370 | + qemu_chr_fe_disconnect(&s->chardev); | |
371 | + return; | |
372 | + } | |
373 | + s->watch = qemu_chr_fe_add_watch(&s->chardev, G_IO_HUP, | |
374 | + vhost_user_blk_watch, dev); | |
375 | + break; | |
376 | + case CHR_EVENT_CLOSED: | |
377 | + vhost_user_blk_disconnect(dev); | |
378 | + if (s->watch) { | |
379 | + g_source_remove(s->watch); | |
380 | + s->watch = 0; | |
381 | + } | |
382 | + break; | |
383 | + } | |
384 | +} | |
385 | + | |
274 | 386 | static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) |
275 | 387 | { |
276 | 388 | VirtIODevice *vdev = VIRTIO_DEVICE(dev); |
277 | 389 | VHostUserBlk *s = VHOST_USER_BLK(vdev); |
278 | - struct vhost_virtqueue *vqs = NULL; | |
390 | + Error *err = NULL; | |
279 | 391 | int i, ret; |
280 | 392 | |
281 | 393 | if (!s->chardev.chr) { |
@@ -306,27 +418,29 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) | ||
306 | 418 | } |
307 | 419 | |
308 | 420 | s->inflight = g_new0(struct vhost_inflight, 1); |
421 | + s->vqs = g_new(struct vhost_virtqueue, s->num_queues); | |
422 | + s->watch = 0; | |
423 | + s->connected = false; | |
309 | 424 | |
310 | - s->dev.nvqs = s->num_queues; | |
311 | - s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs); | |
312 | - s->dev.vq_index = 0; | |
313 | - s->dev.backend_features = 0; | |
314 | - vqs = s->dev.vqs; | |
315 | - | |
316 | - vhost_dev_set_config_notifier(&s->dev, &blk_ops); | |
425 | + qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event, | |
426 | + NULL, (void *)dev, NULL, true); | |
317 | 427 | |
318 | - ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0); | |
319 | - if (ret < 0) { | |
320 | - error_setg(errp, "vhost-user-blk: vhost initialization failed: %s", | |
321 | - strerror(-ret)); | |
428 | +reconnect: | |
429 | + if (qemu_chr_fe_wait_connected(&s->chardev, &err) < 0) { | |
430 | + error_report_err(err); | |
322 | 431 | goto virtio_err; |
323 | 432 | } |
324 | 433 | |
434 | + /* check whether vhost_user_blk_connect() failed or not */ | |
435 | + if (!s->connected) { | |
436 | + goto reconnect; | |
437 | + } | |
438 | + | |
325 | 439 | ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg, |
326 | - sizeof(struct virtio_blk_config)); | |
440 | + sizeof(struct virtio_blk_config)); | |
327 | 441 | if (ret < 0) { |
328 | - error_setg(errp, "vhost-user-blk: get block config failed"); | |
329 | - goto vhost_err; | |
442 | + error_report("vhost-user-blk: get block config failed"); | |
443 | + goto reconnect; | |
330 | 444 | } |
331 | 445 | |
332 | 446 | if (s->blkcfg.num_queues != s->num_queues) { |
@@ -335,10 +449,8 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) | ||
335 | 449 | |
336 | 450 | return; |
337 | 451 | |
338 | -vhost_err: | |
339 | - vhost_dev_cleanup(&s->dev); | |
340 | 452 | virtio_err: |
341 | - g_free(vqs); | |
453 | + g_free(s->vqs); | |
342 | 454 | g_free(s->inflight); |
343 | 455 | virtio_cleanup(vdev); |
344 | 456 | vhost_user_cleanup(&s->vhost_user); |
@@ -348,12 +460,13 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp) | ||
348 | 460 | { |
349 | 461 | VirtIODevice *vdev = VIRTIO_DEVICE(dev); |
350 | 462 | VHostUserBlk *s = VHOST_USER_BLK(dev); |
351 | - struct vhost_virtqueue *vqs = s->dev.vqs; | |
352 | 463 | |
353 | - vhost_user_blk_set_status(vdev, 0); | |
464 | + virtio_set_status(vdev, 0); | |
465 | + qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, | |
466 | + NULL, NULL, NULL, false); | |
354 | 467 | vhost_dev_cleanup(&s->dev); |
355 | 468 | vhost_dev_free_inflight(s->inflight); |
356 | - g_free(vqs); | |
469 | + g_free(s->vqs); | |
357 | 470 | g_free(s->inflight); |
358 | 471 | virtio_cleanup(vdev); |
359 | 472 | vhost_user_cleanup(&s->vhost_user); |
@@ -102,9 +102,26 @@ const size_t hw_compat_2_7_len = G_N_ELEMENTS(hw_compat_2_7); | ||
102 | 102 | |
103 | 103 | GlobalProperty hw_compat_2_6[] = { |
104 | 104 | { "virtio-mmio", "format_transport_address", "off" }, |
105 | - /* Optional because not all virtio-pci devices support legacy mode */ | |
106 | - { "virtio-pci", "disable-modern", "on", .optional = true }, | |
107 | - { "virtio-pci", "disable-legacy", "off", .optional = true }, | |
105 | + /* | |
106 | + * don't include devices which are modern-only | |
107 | + * ie keyboard, mouse, tablet, gpu, vga & crypto | |
108 | + */ | |
109 | + { "virtio-9p-pci", "disable-modern", "on" }, | |
110 | + { "virtio-9p-pci", "disable-legacy", "off" }, | |
111 | + { "virtio-balloon-pci", "disable-modern", "on" }, | |
112 | + { "virtio-balloon-pci", "disable-legacy", "off" }, | |
113 | + { "virtio-blk-pci", "disable-modern", "on" }, | |
114 | + { "virtio-blk-pci", "disable-legacy", "off" }, | |
115 | + { "virtio-input-host-pci", "disable-modern", "on" }, | |
116 | + { "virtio-input-host-pci", "disable-legacy", "off" }, | |
117 | + { "virtio-net-pci", "disable-modern", "on" }, | |
118 | + { "virtio-net-pci", "disable-legacy", "off" }, | |
119 | + { "virtio-rng-pci", "disable-modern", "on" }, | |
120 | + { "virtio-rng-pci", "disable-legacy", "off" }, | |
121 | + { "virtio-scsi-pci", "disable-modern", "on" }, | |
122 | + { "virtio-scsi-pci", "disable-legacy", "off" }, | |
123 | + { "virtio-serial-pci", "disable-modern", "on" }, | |
124 | + { "virtio-serial-pci", "disable-legacy", "off" }, | |
108 | 125 | }; |
109 | 126 | const size_t hw_compat_2_6_len = G_N_ELEMENTS(hw_compat_2_6); |
110 | 127 |
@@ -47,7 +47,9 @@ static void virtio_gpu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) | ||
47 | 47 | Error *local_error = NULL; |
48 | 48 | |
49 | 49 | qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); |
50 | - virtio_pci_force_virtio_1(vpci_dev); | |
50 | + if (!virtio_pci_force_virtio_1(vpci_dev, errp)) { | |
51 | + return; | |
52 | + } | |
51 | 53 | object_property_set_bool(OBJECT(vdev), true, "realized", &local_error); |
52 | 54 | |
53 | 55 | if (local_error) { |
@@ -154,7 +154,9 @@ static void virtio_vga_realize(VirtIOPCIProxy *vpci_dev, Error **errp) | ||
154 | 154 | |
155 | 155 | /* init virtio bits */ |
156 | 156 | qdev_set_parent_bus(DEVICE(g), BUS(&vpci_dev->bus)); |
157 | - virtio_pci_force_virtio_1(vpci_dev); | |
157 | + if (!virtio_pci_force_virtio_1(vpci_dev, errp)) { | |
158 | + return; | |
159 | + } | |
158 | 160 | object_property_set_bool(OBJECT(g), true, "realized", &err); |
159 | 161 | if (err) { |
160 | 162 | error_propagate(errp, err); |
@@ -59,6 +59,7 @@ | ||
59 | 59 | #include "hw/i386/x86-iommu.h" |
60 | 60 | |
61 | 61 | #include "hw/acpi/aml-build.h" |
62 | +#include "hw/acpi/pci.h" | |
62 | 63 | |
63 | 64 | #include "qom/qom-qobject.h" |
64 | 65 | #include "hw/i386/amd_iommu.h" |
@@ -87,11 +88,6 @@ | ||
87 | 88 | /* Default IOAPIC ID */ |
88 | 89 | #define ACPI_BUILD_IOAPIC_ID 0x0 |
89 | 90 | |
90 | -typedef struct AcpiMcfgInfo { | |
91 | - uint64_t mcfg_base; | |
92 | - uint32_t mcfg_size; | |
93 | -} AcpiMcfgInfo; | |
94 | - | |
95 | 91 | typedef struct AcpiPmInfo { |
96 | 92 | bool s3_disabled; |
97 | 93 | bool s4_disabled; |
@@ -2413,29 +2409,16 @@ static void | ||
2413 | 2409 | build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info) |
2414 | 2410 | { |
2415 | 2411 | AcpiTableMcfg *mcfg; |
2416 | - const char *sig; | |
2417 | 2412 | int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); |
2418 | 2413 | |
2419 | 2414 | mcfg = acpi_data_push(table_data, len); |
2420 | - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); | |
2415 | + mcfg->allocation[0].address = cpu_to_le64(info->base); | |
2421 | 2416 | /* Only a single allocation so no need to play with segments */ |
2422 | 2417 | mcfg->allocation[0].pci_segment = cpu_to_le16(0); |
2423 | 2418 | mcfg->allocation[0].start_bus_number = 0; |
2424 | - mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1); | |
2419 | + mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->size - 1); | |
2425 | 2420 | |
2426 | - /* MCFG is used for ECAM which can be enabled or disabled by guest. | |
2427 | - * To avoid table size changes (which create migration issues), | |
2428 | - * always create the table even if there are no allocations, | |
2429 | - * but set the signature to a reserved value in this case. | |
2430 | - * ACPI spec requires OSPMs to ignore such tables. | |
2431 | - */ | |
2432 | - if (info->mcfg_base == PCIE_BASE_ADDR_UNMAPPED) { | |
2433 | - /* Reserved signature: ignored by OSPM */ | |
2434 | - sig = "QEMU"; | |
2435 | - } else { | |
2436 | - sig = "MCFG"; | |
2437 | - } | |
2438 | - build_header(linker, table_data, (void *)mcfg, sig, len, 1, NULL, NULL); | |
2421 | + build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1, NULL, NULL); | |
2439 | 2422 | } |
2440 | 2423 | |
2441 | 2424 | /* |
@@ -2602,12 +2585,15 @@ static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) | ||
2602 | 2585 | if (!o) { |
2603 | 2586 | return false; |
2604 | 2587 | } |
2605 | - mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); | |
2588 | + mcfg->base = qnum_get_uint(qobject_to(QNum, o)); | |
2606 | 2589 | qobject_unref(o); |
2590 | + if (mcfg->base == PCIE_BASE_ADDR_UNMAPPED) { | |
2591 | + return false; | |
2592 | + } | |
2607 | 2593 | |
2608 | 2594 | o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL); |
2609 | 2595 | assert(o); |
2610 | - mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o)); | |
2596 | + mcfg->size = qnum_get_uint(qobject_to(QNum, o)); | |
2611 | 2597 | qobject_unref(o); |
2612 | 2598 | return true; |
2613 | 2599 | } |
@@ -66,11 +66,6 @@ static int pxb_bus_num(PCIBus *bus) | ||
66 | 66 | return pxb->bus_nr; |
67 | 67 | } |
68 | 68 | |
69 | -static bool pxb_is_root(PCIBus *bus) | |
70 | -{ | |
71 | - return true; /* by definition */ | |
72 | -} | |
73 | - | |
74 | 69 | static uint16_t pxb_bus_numa_node(PCIBus *bus) |
75 | 70 | { |
76 | 71 | PXBDev *pxb = convert_to_pxb(bus->parent_dev); |
@@ -83,7 +78,6 @@ static void pxb_bus_class_init(ObjectClass *class, void *data) | ||
83 | 78 | PCIBusClass *pbc = PCI_BUS_CLASS(class); |
84 | 79 | |
85 | 80 | pbc->bus_num = pxb_bus_num; |
86 | - pbc->is_root = pxb_is_root; | |
87 | 81 | pbc->numa_node = pxb_bus_numa_node; |
88 | 82 | } |
89 | 83 |
@@ -129,14 +129,9 @@ static void pci_bus_unrealize(BusState *qbus, Error **errp) | ||
129 | 129 | vmstate_unregister(NULL, &vmstate_pcibus, bus); |
130 | 130 | } |
131 | 131 | |
132 | -static bool pcibus_is_root(PCIBus *bus) | |
133 | -{ | |
134 | - return !bus->parent_dev; | |
135 | -} | |
136 | - | |
137 | 132 | static int pcibus_num(PCIBus *bus) |
138 | 133 | { |
139 | - if (pcibus_is_root(bus)) { | |
134 | + if (pci_bus_is_root(bus)) { | |
140 | 135 | return 0; /* pci host bridge */ |
141 | 136 | } |
142 | 137 | return bus->parent_dev->config[PCI_SECONDARY_BUS]; |
@@ -164,7 +159,6 @@ static void pci_bus_class_init(ObjectClass *klass, void *data) | ||
164 | 159 | k->unrealize = pci_bus_unrealize; |
165 | 160 | k->reset = pcibus_reset; |
166 | 161 | |
167 | - pbc->is_root = pcibus_is_root; | |
168 | 162 | pbc->bus_num = pcibus_num; |
169 | 163 | pbc->numa_node = pcibus_numa_node; |
170 | 164 | pbc->allows_extended_config_space = pcibus_allows_extended_config_space; |
@@ -398,6 +392,7 @@ static void pci_root_bus_init(PCIBus *bus, DeviceState *parent, | ||
398 | 392 | bus->slot_reserved_mask = 0x0; |
399 | 393 | bus->address_space_mem = address_space_mem; |
400 | 394 | bus->address_space_io = address_space_io; |
395 | + bus->flags |= PCI_BUS_IS_ROOT; | |
401 | 396 | |
402 | 397 | /* host bridge */ |
403 | 398 | QLIST_INIT(&bus->child); |
@@ -415,11 +410,6 @@ bool pci_bus_is_express(PCIBus *bus) | ||
415 | 410 | return object_dynamic_cast(OBJECT(bus), TYPE_PCIE_BUS); |
416 | 411 | } |
417 | 412 | |
418 | -bool pci_bus_is_root(PCIBus *bus) | |
419 | -{ | |
420 | - return PCI_BUS_GET_CLASS(bus)->is_root(bus); | |
421 | -} | |
422 | - | |
423 | 413 | bool pci_bus_allows_extended_config_space(PCIBus *bus) |
424 | 414 | { |
425 | 415 | return PCI_BUS_GET_CLASS(bus)->allows_extended_config_space(bus); |
@@ -47,11 +47,6 @@ static void pcie_mmcfg_data_write(void *opaque, hwaddr mmcfg_addr, | ||
47 | 47 | } |
48 | 48 | addr = PCIE_MMCFG_CONFOFFSET(mmcfg_addr); |
49 | 49 | limit = pci_config_size(pci_dev); |
50 | - if (limit <= addr) { | |
51 | - /* conventional pci device can be behind pcie-to-pci bridge. | |
52 | - 256 <= addr < 4K has no effects. */ | |
53 | - return; | |
54 | - } | |
55 | 50 | pci_host_config_write_common(pci_dev, addr, limit, val, len); |
56 | 51 | } |
57 | 52 |
@@ -70,11 +65,6 @@ static uint64_t pcie_mmcfg_data_read(void *opaque, | ||
70 | 65 | } |
71 | 66 | addr = PCIE_MMCFG_CONFOFFSET(mmcfg_addr); |
72 | 67 | limit = pci_config_size(pci_dev); |
73 | - if (limit <= addr) { | |
74 | - /* conventional pci device can be behind pcie-to-pci bridge. | |
75 | - 256 <= addr < 4K has no effects. */ | |
76 | - return ~0x0; | |
77 | - } | |
78 | 68 | return pci_host_config_read_common(pci_dev, addr, limit, len); |
79 | 69 | } |
80 | 70 |
@@ -51,7 +51,9 @@ static void virtio_crypto_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) | ||
51 | 51 | } |
52 | 52 | |
53 | 53 | qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); |
54 | - virtio_pci_force_virtio_1(vpci_dev); | |
54 | + if (!virtio_pci_force_virtio_1(vpci_dev, errp)) { | |
55 | + return; | |
56 | + } | |
55 | 57 | object_property_set_bool(OBJECT(vdev), true, "realized", errp); |
56 | 58 | object_property_set_link(OBJECT(vcrypto), |
57 | 59 | OBJECT(vcrypto->vdev.conf.cryptodev), "cryptodev", |
@@ -48,7 +48,9 @@ static void virtio_input_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) | ||
48 | 48 | DeviceState *vdev = DEVICE(&vinput->vdev); |
49 | 49 | |
50 | 50 | qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); |
51 | - virtio_pci_force_virtio_1(vpci_dev); | |
51 | + if (!virtio_pci_force_virtio_1(vpci_dev, errp)) { | |
52 | + return; | |
53 | + } | |
52 | 54 | object_property_set_bool(OBJECT(vdev), true, "realized", errp); |
53 | 55 | } |
54 | 56 |
@@ -20,6 +20,7 @@ | ||
20 | 20 | #include "standard-headers/linux/virtio_pci.h" |
21 | 21 | #include "hw/virtio/virtio.h" |
22 | 22 | #include "hw/pci/pci.h" |
23 | +#include "hw/pci/pci_bus.h" | |
23 | 24 | #include "qapi/error.h" |
24 | 25 | #include "qemu/error-report.h" |
25 | 26 | #include "hw/pci/msi.h" |
@@ -1721,16 +1722,22 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) | ||
1721 | 1722 | /* PCI BAR regions must be powers of 2 */ |
1722 | 1723 | pow2ceil(proxy->notify.offset + proxy->notify.size)); |
1723 | 1724 | |
1724 | - if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) { | |
1725 | - proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; | |
1726 | - } | |
1727 | - | |
1728 | - if (!virtio_pci_modern(proxy) && !virtio_pci_legacy(proxy)) { | |
1729 | - error_setg(errp, "device cannot work as neither modern nor legacy mode" | |
1730 | - " is enabled"); | |
1731 | - error_append_hint(errp, "Set either disable-modern or disable-legacy" | |
1732 | - " to off\n"); | |
1733 | - return; | |
1725 | + if ((proxy->disable_legacy == ON_OFF_AUTO_ON) || | |
1726 | + ((proxy->disable_legacy == ON_OFF_AUTO_AUTO) && pcie_port)) { | |
1727 | + if (proxy->disable_modern) { | |
1728 | + error_setg(errp, "device cannot work as neither modern nor " | |
1729 | + "legacy mode is enabled"); | |
1730 | + error_append_hint(errp, "Set either disable-modern or " | |
1731 | + "disable-legacy to off\n"); | |
1732 | + return; | |
1733 | + } | |
1734 | + proxy->mode = VIRTIO_PCI_MODE_MODERN; | |
1735 | + } else { | |
1736 | + if (proxy->disable_modern) { | |
1737 | + proxy->mode = VIRTIO_PCI_MODE_LEGACY; | |
1738 | + } else { | |
1739 | + proxy->mode = VIRTIO_PCI_MODE_TRANSITIONAL; | |
1740 | + } | |
1734 | 1741 | } |
1735 | 1742 | |
1736 | 1743 | if (pcie_port && pci_is_express(pci_dev)) { |
@@ -15,6 +15,7 @@ | ||
15 | 15 | #ifndef QEMU_VIRTIO_PCI_H |
16 | 16 | #define QEMU_VIRTIO_PCI_H |
17 | 17 | |
18 | +#include "qapi/error.h" | |
18 | 19 | #include "hw/pci/msi.h" |
19 | 20 | #include "hw/virtio/virtio-bus.h" |
20 | 21 |
@@ -118,6 +119,12 @@ typedef struct VirtIOPCIQueue { | ||
118 | 119 | uint32_t used[2]; |
119 | 120 | } VirtIOPCIQueue; |
120 | 121 | |
122 | +typedef enum { | |
123 | + VIRTIO_PCI_MODE_LEGACY, | |
124 | + VIRTIO_PCI_MODE_TRANSITIONAL, | |
125 | + VIRTIO_PCI_MODE_MODERN, | |
126 | +} VirtIOPCIMode; | |
127 | + | |
121 | 128 | struct VirtIOPCIProxy { |
122 | 129 | PCIDevice pci_dev; |
123 | 130 | MemoryRegion bar; |
@@ -142,6 +149,7 @@ struct VirtIOPCIProxy { | ||
142 | 149 | bool disable_modern; |
143 | 150 | bool ignore_backend_features; |
144 | 151 | OnOffAuto disable_legacy; |
152 | + VirtIOPCIMode mode; | |
145 | 153 | uint32_t class_code; |
146 | 154 | uint32_t nvectors; |
147 | 155 | uint32_t dfselect; |
@@ -156,23 +164,34 @@ struct VirtIOPCIProxy { | ||
156 | 164 | |
157 | 165 | static inline bool virtio_pci_modern(VirtIOPCIProxy *proxy) |
158 | 166 | { |
159 | - return !proxy->disable_modern; | |
167 | + return proxy->mode != VIRTIO_PCI_MODE_LEGACY; | |
160 | 168 | } |
161 | 169 | |
162 | 170 | static inline bool virtio_pci_legacy(VirtIOPCIProxy *proxy) |
163 | 171 | { |
164 | - return proxy->disable_legacy == ON_OFF_AUTO_OFF; | |
172 | + return proxy->mode != VIRTIO_PCI_MODE_MODERN; | |
165 | 173 | } |
166 | 174 | |
167 | -static inline void virtio_pci_force_virtio_1(VirtIOPCIProxy *proxy) | |
175 | +static inline bool virtio_pci_force_virtio_1(VirtIOPCIProxy *proxy, | |
176 | + Error **errp) | |
168 | 177 | { |
169 | - proxy->disable_modern = false; | |
170 | - proxy->disable_legacy = ON_OFF_AUTO_ON; | |
178 | + if (proxy->disable_legacy == ON_OFF_AUTO_OFF) { | |
179 | + error_setg(errp, "Unable to set disable-legacy=off on a virtio-1.0 " | |
180 | + "only device"); | |
181 | + return false; | |
182 | + } | |
183 | + if (proxy->disable_modern == true) { | |
184 | + error_setg(errp, "Unable to set disable-modern=on on a virtio-1.0 " | |
185 | + "only device"); | |
186 | + return false; | |
187 | + } | |
188 | + proxy->mode = VIRTIO_PCI_MODE_MODERN; | |
189 | + return true; | |
171 | 190 | } |
172 | 191 | |
173 | 192 | static inline void virtio_pci_disable_modern(VirtIOPCIProxy *proxy) |
174 | 193 | { |
175 | - proxy->disable_modern = true; | |
194 | + proxy->mode = VIRTIO_PCI_MODE_LEGACY; | |
176 | 195 | } |
177 | 196 | |
178 | 197 | /* |
@@ -1162,10 +1162,16 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) | ||
1162 | 1162 | } |
1163 | 1163 | } |
1164 | 1164 | } |
1165 | + vdev->started = val & VIRTIO_CONFIG_S_DRIVER_OK; | |
1166 | + if (unlikely(vdev->start_on_kick && vdev->started)) { | |
1167 | + vdev->start_on_kick = false; | |
1168 | + } | |
1169 | + | |
1165 | 1170 | if (k->set_status) { |
1166 | 1171 | k->set_status(vdev, val); |
1167 | 1172 | } |
1168 | 1173 | vdev->status = val; |
1174 | + | |
1169 | 1175 | return 0; |
1170 | 1176 | } |
1171 | 1177 |
@@ -1208,6 +1214,9 @@ void virtio_reset(void *opaque) | ||
1208 | 1214 | k->reset(vdev); |
1209 | 1215 | } |
1210 | 1216 | |
1217 | + vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) && | |
1218 | + !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)); | |
1219 | + vdev->started = false; | |
1211 | 1220 | vdev->broken = false; |
1212 | 1221 | vdev->guest_features = 0; |
1213 | 1222 | vdev->queue_sel = 0; |
@@ -1518,14 +1527,21 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align) | ||
1518 | 1527 | |
1519 | 1528 | static bool virtio_queue_notify_aio_vq(VirtQueue *vq) |
1520 | 1529 | { |
1530 | + bool ret = false; | |
1531 | + | |
1521 | 1532 | if (vq->vring.desc && vq->handle_aio_output) { |
1522 | 1533 | VirtIODevice *vdev = vq->vdev; |
1523 | 1534 | |
1524 | 1535 | trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); |
1525 | - return vq->handle_aio_output(vdev, vq); | |
1536 | + ret = vq->handle_aio_output(vdev, vq); | |
1537 | + | |
1538 | + if (unlikely(vdev->start_on_kick)) { | |
1539 | + vdev->started = true; | |
1540 | + vdev->start_on_kick = false; | |
1541 | + } | |
1526 | 1542 | } |
1527 | 1543 | |
1528 | - return false; | |
1544 | + return ret; | |
1529 | 1545 | } |
1530 | 1546 | |
1531 | 1547 | static void virtio_queue_notify_vq(VirtQueue *vq) |
@@ -1539,6 +1555,11 @@ static void virtio_queue_notify_vq(VirtQueue *vq) | ||
1539 | 1555 | |
1540 | 1556 | trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); |
1541 | 1557 | vq->handle_output(vdev, vq); |
1558 | + | |
1559 | + if (unlikely(vdev->start_on_kick)) { | |
1560 | + vdev->started = true; | |
1561 | + vdev->start_on_kick = false; | |
1562 | + } | |
1542 | 1563 | } |
1543 | 1564 | } |
1544 | 1565 |
@@ -1556,6 +1577,11 @@ void virtio_queue_notify(VirtIODevice *vdev, int n) | ||
1556 | 1577 | } else if (vq->handle_output) { |
1557 | 1578 | vq->handle_output(vdev, vq); |
1558 | 1579 | } |
1580 | + | |
1581 | + if (unlikely(vdev->start_on_kick)) { | |
1582 | + vdev->started = true; | |
1583 | + vdev->start_on_kick = false; | |
1584 | + } | |
1559 | 1585 | } |
1560 | 1586 | |
1561 | 1587 | uint16_t virtio_queue_vector(VirtIODevice *vdev, int n) |
@@ -1770,6 +1796,13 @@ static bool virtio_broken_needed(void *opaque) | ||
1770 | 1796 | return vdev->broken; |
1771 | 1797 | } |
1772 | 1798 | |
1799 | +static bool virtio_started_needed(void *opaque) | |
1800 | +{ | |
1801 | + VirtIODevice *vdev = opaque; | |
1802 | + | |
1803 | + return vdev->started; | |
1804 | +} | |
1805 | + | |
1773 | 1806 | static const VMStateDescription vmstate_virtqueue = { |
1774 | 1807 | .name = "virtqueue_state", |
1775 | 1808 | .version_id = 1, |
@@ -1898,6 +1931,17 @@ static const VMStateDescription vmstate_virtio_broken = { | ||
1898 | 1931 | } |
1899 | 1932 | }; |
1900 | 1933 | |
1934 | +static const VMStateDescription vmstate_virtio_started = { | |
1935 | + .name = "virtio/started", | |
1936 | + .version_id = 1, | |
1937 | + .minimum_version_id = 1, | |
1938 | + .needed = &virtio_started_needed, | |
1939 | + .fields = (VMStateField[]) { | |
1940 | + VMSTATE_BOOL(started, VirtIODevice), | |
1941 | + VMSTATE_END_OF_LIST() | |
1942 | + } | |
1943 | +}; | |
1944 | + | |
1901 | 1945 | static const VMStateDescription vmstate_virtio = { |
1902 | 1946 | .name = "virtio", |
1903 | 1947 | .version_id = 1, |
@@ -1913,6 +1957,7 @@ static const VMStateDescription vmstate_virtio = { | ||
1913 | 1957 | &vmstate_virtio_ringsize, |
1914 | 1958 | &vmstate_virtio_broken, |
1915 | 1959 | &vmstate_virtio_extra_state, |
1960 | + &vmstate_virtio_started, | |
1916 | 1961 | NULL |
1917 | 1962 | } |
1918 | 1963 | }; |
@@ -2246,7 +2291,7 @@ static void virtio_vmstate_change(void *opaque, int running, RunState state) | ||
2246 | 2291 | VirtIODevice *vdev = opaque; |
2247 | 2292 | BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); |
2248 | 2293 | VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); |
2249 | - bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK); | |
2294 | + bool backend_run = running && vdev->started; | |
2250 | 2295 | vdev->vm_running = running; |
2251 | 2296 | |
2252 | 2297 | if (backend_run) { |
@@ -2286,6 +2331,9 @@ void virtio_init(VirtIODevice *vdev, const char *name, | ||
2286 | 2331 | g_malloc0(sizeof(*vdev->vector_queues) * nvectors); |
2287 | 2332 | } |
2288 | 2333 | |
2334 | + vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) && | |
2335 | + !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)); | |
2336 | + vdev->started = false; | |
2289 | 2337 | vdev->device_id = device_id; |
2290 | 2338 | vdev->status = 0; |
2291 | 2339 | atomic_set(&vdev->isr, 0); |
@@ -0,0 +1,33 @@ | ||
1 | +/* | |
2 | + * Support for generating PCI related ACPI tables and passing them to Guests | |
3 | + * | |
4 | + * Copyright (C) 2006 Fabrice Bellard | |
5 | + * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net> | |
6 | + * Copyright (C) 2013-2019 Red Hat Inc | |
7 | + * Copyright (C) 2019 Intel Corporation | |
8 | + * | |
9 | + * Author: Wei Yang <richardw.yang@linux.intel.com> | |
10 | + * Author: Michael S. Tsirkin <mst@redhat.com> | |
11 | + * | |
12 | + * This program is free software; you can redistribute it and/or modify | |
13 | + * it under the terms of the GNU General Public License as published by | |
14 | + * the Free Software Foundation; either version 2 of the License, or | |
15 | + * (at your option) any later version. | |
16 | + | |
17 | + * This program is distributed in the hope that it will be useful, | |
18 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | + * GNU General Public License for more details. | |
21 | + | |
22 | + * You should have received a copy of the GNU General Public License along | |
23 | + * with this program; if not, see <http://www.gnu.org/licenses/>. | |
24 | + */ | |
25 | +#ifndef HW_ACPI_PCI_H | |
26 | +#define HW_ACPI_PCI_H | |
27 | + | |
28 | +typedef struct AcpiMcfgInfo { | |
29 | + uint64_t base; | |
30 | + uint32_t size; | |
31 | +} AcpiMcfgInfo; | |
32 | + | |
33 | +#endif |
@@ -395,7 +395,6 @@ typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); | ||
395 | 395 | #define TYPE_PCIE_BUS "PCIE" |
396 | 396 | |
397 | 397 | bool pci_bus_is_express(PCIBus *bus); |
398 | -bool pci_bus_is_root(PCIBus *bus); | |
399 | 398 | bool pci_bus_allows_extended_config_space(PCIBus *bus); |
400 | 399 | |
401 | 400 | void pci_root_bus_new_inplace(PCIBus *bus, size_t bus_size, DeviceState *parent, |
@@ -15,14 +15,19 @@ typedef struct PCIBusClass { | ||
15 | 15 | BusClass parent_class; |
16 | 16 | /*< public >*/ |
17 | 17 | |
18 | - bool (*is_root)(PCIBus *bus); | |
19 | 18 | int (*bus_num)(PCIBus *bus); |
20 | 19 | uint16_t (*numa_node)(PCIBus *bus); |
21 | 20 | bool (*allows_extended_config_space)(PCIBus *bus); |
22 | 21 | } PCIBusClass; |
23 | 22 | |
23 | +enum PCIBusFlags { | |
24 | + /* This bus is the root of a PCI domain */ | |
25 | + PCI_BUS_IS_ROOT = 0x0001, | |
26 | +}; | |
27 | + | |
24 | 28 | struct PCIBus { |
25 | 29 | BusState qbus; |
30 | + enum PCIBusFlags flags; | |
26 | 31 | PCIIOMMUFunc iommu_fn; |
27 | 32 | void *iommu_opaque; |
28 | 33 | uint8_t devfn_min; |
@@ -47,4 +52,9 @@ struct PCIBus { | ||
47 | 52 | Notifier machine_done; |
48 | 53 | }; |
49 | 54 | |
55 | +static inline bool pci_bus_is_root(PCIBus *bus) | |
56 | +{ | |
57 | + return !!(bus->flags & PCI_BUS_IS_ROOT); | |
58 | +} | |
59 | + | |
50 | 60 | #endif /* QEMU_PCI_BUS_H */ |
@@ -251,8 +251,6 @@ struct PropertyInfo { | ||
251 | 251 | /** |
252 | 252 | * GlobalProperty: |
253 | 253 | * @used: Set to true if property was used when initializing a device. |
254 | - * @optional: If set to true, GlobalProperty will be skipped without errors | |
255 | - * if the property doesn't exist. | |
256 | 254 | * |
257 | 255 | * An error is fatal for non-hotplugged devices, when the global is applied. |
258 | 256 | */ |
@@ -261,7 +259,6 @@ typedef struct GlobalProperty { | ||
261 | 259 | const char *property; |
262 | 260 | const char *value; |
263 | 261 | bool used; |
264 | - bool optional; | |
265 | 262 | } GlobalProperty; |
266 | 263 | |
267 | 264 | static inline void |
@@ -38,6 +38,9 @@ typedef struct VHostUserBlk { | ||
38 | 38 | struct vhost_dev dev; |
39 | 39 | struct vhost_inflight *inflight; |
40 | 40 | VhostUserState vhost_user; |
41 | + struct vhost_virtqueue *vqs; | |
42 | + guint watch; | |
43 | + bool connected; | |
41 | 44 | } VHostUserBlk; |
42 | 45 | |
43 | 46 | #endif |
@@ -105,6 +105,8 @@ struct VirtIODevice | ||
105 | 105 | uint16_t device_id; |
106 | 106 | bool vm_running; |
107 | 107 | bool broken; /* device in invalid state, needs reset */ |
108 | + bool started; | |
109 | + bool start_on_kick; /* virtio 1.0 transitional devices support that */ | |
108 | 110 | VMChangeStateEntry *vmstate; |
109 | 111 | char *bus_name; |
110 | 112 | uint8_t device_endian; |
@@ -236,7 +236,6 @@ static void chr_closed_bh(void *opaque) | ||
236 | 236 | s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); |
237 | 237 | |
238 | 238 | qmp_set_link(name, false, &err); |
239 | - vhost_user_stop(queues, ncs); | |
240 | 239 | |
241 | 240 | qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, net_vhost_user_event, |
242 | 241 | NULL, opaque, NULL, true); |
@@ -385,9 +385,6 @@ void object_apply_global_props(Object *obj, const GPtrArray *props, Error **errp | ||
385 | 385 | if (object_dynamic_cast(obj, p->driver) == NULL) { |
386 | 386 | continue; |
387 | 387 | } |
388 | - if (p->optional && !object_property_find(obj, p->property, NULL)) { | |
389 | - continue; | |
390 | - } | |
391 | 388 | p->used = true; |
392 | 389 | object_property_parse(obj, p->value, p->property, &err); |
393 | 390 | if (err != NULL) { |
@@ -51,19 +51,7 @@ uint32_t acpi_find_rsdp_address(QTestState *qts) | ||
51 | 51 | return off; |
52 | 52 | } |
53 | 53 | |
54 | -uint64_t acpi_get_xsdt_address(uint8_t *rsdp_table) | |
55 | -{ | |
56 | - uint64_t xsdt_physical_address; | |
57 | - uint8_t revision = rsdp_table[15 /* Revision offset */]; | |
58 | - | |
59 | - /* We must have revision 2 if we're looking for an XSDT pointer */ | |
60 | - g_assert(revision == 2); | |
61 | - | |
62 | - memcpy(&xsdt_physical_address, &rsdp_table[24 /* XsdtAddress offset */], 8); | |
63 | - return le64_to_cpu(xsdt_physical_address); | |
64 | -} | |
65 | - | |
66 | -void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table) | |
54 | +void acpi_fetch_rsdp_table(QTestState *qts, uint64_t addr, uint8_t *rsdp_table) | |
67 | 55 | { |
68 | 56 | uint8_t revision; |
69 | 57 |
@@ -91,13 +79,15 @@ void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table) | ||
91 | 79 | * actual one. |
92 | 80 | */ |
93 | 81 | void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len, |
94 | - const uint8_t *addr_ptr, const char *sig, | |
82 | + const uint8_t *addr_ptr, int addr_size, const char *sig, | |
95 | 83 | bool verify_checksum) |
96 | 84 | { |
97 | - uint32_t addr, len; | |
85 | + uint32_t len; | |
86 | + uint64_t addr = 0; | |
98 | 87 | |
99 | - memcpy(&addr, addr_ptr , sizeof(addr)); | |
100 | - addr = le32_to_cpu(addr); | |
88 | + g_assert(addr_size == 4 || addr_size == 8); | |
89 | + memcpy(&addr, addr_ptr , addr_size); | |
90 | + addr = le64_to_cpu(addr); | |
101 | 91 | qtest_memread(qts, addr + 4, &len, 4); /* Length of ACPI table */ |
102 | 92 | *aml_len = le32_to_cpu(len); |
103 | 93 | *aml = g_malloc0(*aml_len); |
@@ -111,3 +101,47 @@ void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len, | ||
111 | 101 | g_assert(!acpi_calc_checksum(*aml, *aml_len)); |
112 | 102 | } |
113 | 103 | } |
104 | + | |
105 | +#define GUID_SIZE 16 | |
106 | +static const uint8_t AcpiTestSupportGuid[GUID_SIZE] = { | |
107 | + 0xb1, 0xa6, 0x87, 0xab, | |
108 | + 0x34, 0x20, | |
109 | + 0xa0, 0xbd, | |
110 | + 0x71, 0xbd, 0x37, 0x50, 0x07, 0x75, 0x77, 0x85 }; | |
111 | + | |
112 | +typedef struct { | |
113 | + uint8_t signature_guid[GUID_SIZE]; | |
114 | + uint64_t rsdp10; | |
115 | + uint64_t rsdp20; | |
116 | +} __attribute__((packed)) UefiTestSupport; | |
117 | + | |
118 | +/* Wait at most 600 seconds (test is slow with TCG and --enable-debug) */ | |
119 | +#define TEST_DELAY (1 * G_USEC_PER_SEC / 10) | |
120 | +#define TEST_CYCLES MAX((600 * G_USEC_PER_SEC / TEST_DELAY), 1) | |
121 | +#define MB 0x100000ULL | |
122 | +uint64_t acpi_find_rsdp_address_uefi(QTestState *qts, uint64_t start, | |
123 | + uint64_t size) | |
124 | +{ | |
125 | + int i, j; | |
126 | + uint8_t data[GUID_SIZE]; | |
127 | + | |
128 | + for (i = 0; i < TEST_CYCLES; ++i) { | |
129 | + for (j = 0; j < size / MB; j++) { | |
130 | + /* look for GUID at every 1Mb block */ | |
131 | + uint64_t addr = start + j * MB; | |
132 | + | |
133 | + qtest_memread(qts, addr, data, sizeof(data)); | |
134 | + if (!memcmp(AcpiTestSupportGuid, data, sizeof(data))) { | |
135 | + UefiTestSupport ret; | |
136 | + | |
137 | + qtest_memread(qts, addr, &ret, sizeof(ret)); | |
138 | + ret.rsdp10 = le64_to_cpu(ret.rsdp10); | |
139 | + ret.rsdp20 = le64_to_cpu(ret.rsdp20); | |
140 | + return ret.rsdp20 ? ret.rsdp20 : ret.rsdp10; | |
141 | + } | |
142 | + } | |
143 | + g_usleep(TEST_DELAY); | |
144 | + } | |
145 | + g_assert_not_reached(); | |
146 | + return 0; | |
147 | +} |
@@ -46,10 +46,11 @@ typedef struct { | ||
46 | 46 | |
47 | 47 | uint8_t acpi_calc_checksum(const uint8_t *data, int len); |
48 | 48 | uint32_t acpi_find_rsdp_address(QTestState *qts); |
49 | -uint64_t acpi_get_xsdt_address(uint8_t *rsdp_table); | |
50 | -void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table); | |
49 | +uint64_t acpi_find_rsdp_address_uefi(QTestState *qts, uint64_t start, | |
50 | + uint64_t size); | |
51 | +void acpi_fetch_rsdp_table(QTestState *qts, uint64_t addr, uint8_t *rsdp_table); | |
51 | 52 | void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len, |
52 | - const uint8_t *addr_ptr, const char *sig, | |
53 | + const uint8_t *addr_ptr, int addr_size, const char *sig, | |
53 | 54 | bool verify_checksum); |
54 | 55 | |
55 | 56 | #endif /* TEST_ACPI_UTILS_H */ |
@@ -24,9 +24,15 @@ | ||
24 | 24 | #define ACPI_REBUILD_EXPECTED_AML "TEST_ACPI_REBUILD_AML" |
25 | 25 | |
26 | 26 | typedef struct { |
27 | + const char *accel; | |
27 | 28 | const char *machine; |
28 | 29 | const char *variant; |
29 | - uint32_t rsdp_addr; | |
30 | + const char *uefi_fl1; | |
31 | + const char *uefi_fl2; | |
32 | + const char *cd; | |
33 | + const uint64_t ram_start; | |
34 | + const uint64_t scan_len; | |
35 | + uint64_t rsdp_addr; | |
30 | 36 | uint8_t rsdp_table[36 /* ACPI 2.0+ RSDP size */]; |
31 | 37 | GArray *tables; |
32 | 38 | uint32_t smbios_ep_addr; |
@@ -77,22 +83,13 @@ static void free_test_data(test_data *data) | ||
77 | 83 | g_array_free(data->tables, true); |
78 | 84 | } |
79 | 85 | |
80 | -static void test_acpi_rsdp_address(test_data *data) | |
81 | -{ | |
82 | - uint32_t off = acpi_find_rsdp_address(data->qts); | |
83 | - g_assert_cmphex(off, <, 0x100000); | |
84 | - data->rsdp_addr = off; | |
85 | -} | |
86 | - | |
87 | 86 | static void test_acpi_rsdp_table(test_data *data) |
88 | 87 | { |
89 | - uint8_t *rsdp_table = data->rsdp_table, revision; | |
90 | - uint32_t addr = data->rsdp_addr; | |
88 | + uint8_t *rsdp_table = data->rsdp_table; | |
91 | 89 | |
92 | - acpi_parse_rsdp_table(data->qts, addr, rsdp_table); | |
93 | - revision = rsdp_table[15 /* Revision offset */]; | |
90 | + acpi_fetch_rsdp_table(data->qts, data->rsdp_addr, rsdp_table); | |
94 | 91 | |
95 | - switch (revision) { | |
92 | + switch (rsdp_table[15 /* Revision offset */]) { | |
96 | 93 | case 0: /* ACPI 1.0 RSDP */ |
97 | 94 | /* With rev 1, checksum is only for the first 20 bytes */ |
98 | 95 | g_assert(!acpi_calc_checksum(rsdp_table, 20)); |
@@ -107,21 +104,29 @@ static void test_acpi_rsdp_table(test_data *data) | ||
107 | 104 | } |
108 | 105 | } |
109 | 106 | |
110 | -static void test_acpi_rsdt_table(test_data *data) | |
107 | +static void test_acpi_rxsdt_table(test_data *data) | |
111 | 108 | { |
109 | + const char *sig = "RSDT"; | |
112 | 110 | AcpiSdtTable rsdt = {}; |
111 | + int entry_size = 4; | |
112 | + int addr_off = 16 /* RsdtAddress */; | |
113 | 113 | uint8_t *ent; |
114 | 114 | |
115 | - /* read RSDT table */ | |
115 | + if (data->rsdp_table[15 /* Revision offset */] != 0) { | |
116 | + addr_off = 24 /* XsdtAddress */; | |
117 | + entry_size = 8; | |
118 | + sig = "XSDT"; | |
119 | + } | |
120 | + /* read [RX]SDT table */ | |
116 | 121 | acpi_fetch_table(data->qts, &rsdt.aml, &rsdt.aml_len, |
117 | - &data->rsdp_table[16 /* RsdtAddress */], "RSDT", true); | |
122 | + &data->rsdp_table[addr_off], entry_size, sig, true); | |
118 | 123 | |
119 | 124 | /* Load all tables and add to test list directly RSDT referenced tables */ |
120 | - ACPI_FOREACH_RSDT_ENTRY(rsdt.aml, rsdt.aml_len, ent, 4 /* Entry size */) { | |
125 | + ACPI_FOREACH_RSDT_ENTRY(rsdt.aml, rsdt.aml_len, ent, entry_size) { | |
121 | 126 | AcpiSdtTable ssdt_table = {}; |
122 | 127 | |
123 | 128 | acpi_fetch_table(data->qts, &ssdt_table.aml, &ssdt_table.aml_len, ent, |
124 | - NULL, true); | |
129 | + entry_size, NULL, true); | |
125 | 130 | /* Add table to ASL test tables list */ |
126 | 131 | g_array_append_val(data->tables, ssdt_table); |
127 | 132 | } |
@@ -134,16 +139,29 @@ static void test_acpi_fadt_table(test_data *data) | ||
134 | 139 | AcpiSdtTable table = g_array_index(data->tables, typeof(table), 0); |
135 | 140 | uint8_t *fadt_aml = table.aml; |
136 | 141 | uint32_t fadt_len = table.aml_len; |
142 | + uint32_t val; | |
143 | + int dsdt_offset = 40 /* DSDT */; | |
144 | + int dsdt_entry_size = 4; | |
137 | 145 | |
138 | 146 | g_assert(compare_signature(&table, "FACP")); |
139 | 147 | |
140 | 148 | /* Since DSDT/FACS isn't in RSDT, add them to ASL test list manually */ |
141 | - acpi_fetch_table(data->qts, &table.aml, &table.aml_len, | |
142 | - fadt_aml + 36 /* FIRMWARE_CTRL */, "FACS", false); | |
143 | - g_array_append_val(data->tables, table); | |
149 | + memcpy(&val, fadt_aml + 112 /* Flags */, 4); | |
150 | + val = le32_to_cpu(val); | |
151 | + if (!(val & 1UL << 20 /* HW_REDUCED_ACPI */)) { | |
152 | + acpi_fetch_table(data->qts, &table.aml, &table.aml_len, | |
153 | + fadt_aml + 36 /* FIRMWARE_CTRL */, 4, "FACS", false); | |
154 | + g_array_append_val(data->tables, table); | |
155 | + } | |
144 | 156 | |
157 | + memcpy(&val, fadt_aml + dsdt_offset, 4); | |
158 | + val = le32_to_cpu(val); | |
159 | + if (!val) { | |
160 | + dsdt_offset = 140 /* X_DSDT */; | |
161 | + dsdt_entry_size = 8; | |
162 | + } | |
145 | 163 | acpi_fetch_table(data->qts, &table.aml, &table.aml_len, |
146 | - fadt_aml + 40 /* DSDT */, "DSDT", true); | |
164 | + fadt_aml + dsdt_offset, dsdt_entry_size, "DSDT", true); | |
147 | 165 | g_array_append_val(data->tables, table); |
148 | 166 | |
149 | 167 | memset(fadt_aml + 36, 0, 4); /* sanitize FIRMWARE_CTRL ptr */ |
@@ -177,11 +195,14 @@ static void dump_aml_files(test_data *data, bool rebuild) | ||
177 | 195 | sdt->aml, ext); |
178 | 196 | fd = g_open(aml_file, O_WRONLY|O_TRUNC|O_CREAT, |
179 | 197 | S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH); |
198 | + if (fd < 0) { | |
199 | + perror(aml_file); | |
200 | + } | |
201 | + g_assert(fd >= 0); | |
180 | 202 | } else { |
181 | 203 | fd = g_file_open_tmp("aml-XXXXXX", &sdt->aml_file, &error); |
182 | 204 | g_assert_no_error(error); |
183 | 205 | } |
184 | - g_assert(fd >= 0); | |
185 | 206 | |
186 | 207 | ret = qemu_write_full(fd, sdt->aml, sdt->aml_len); |
187 | 208 | g_assert(ret == sdt->aml_len); |
@@ -505,23 +526,44 @@ static void test_smbios_structs(test_data *data) | ||
505 | 526 | static void test_acpi_one(const char *params, test_data *data) |
506 | 527 | { |
507 | 528 | char *args; |
508 | - | |
509 | - /* Disable kernel irqchip to be able to override apic irq0. */ | |
510 | - args = g_strdup_printf("-machine %s,accel=%s,kernel-irqchip=off " | |
511 | - "-net none -display none %s " | |
512 | - "-drive id=hd0,if=none,file=%s,format=raw " | |
513 | - "-device ide-hd,drive=hd0 ", | |
514 | - data->machine, "kvm:tcg", | |
515 | - params ? params : "", disk); | |
529 | + bool use_uefi = data->uefi_fl1 && data->uefi_fl2; | |
530 | + | |
531 | + if (use_uefi) { | |
532 | + /* | |
533 | + * TODO: convert '-drive if=pflash' to new syntax (see e33763be7cd3) | |
534 | + * when arm/virt board starts to support it. | |
535 | + */ | |
536 | + args = g_strdup_printf("-machine %s,accel=%s -nodefaults -nographic " | |
537 | + "-drive if=pflash,format=raw,file=%s,readonly " | |
538 | + "-drive if=pflash,format=raw,file=%s,snapshot=on -cdrom %s %s", | |
539 | + data->machine, data->accel ? data->accel : "kvm:tcg", | |
540 | + data->uefi_fl1, data->uefi_fl2, data->cd, params ? params : ""); | |
541 | + | |
542 | + } else { | |
543 | + /* Disable kernel irqchip to be able to override apic irq0. */ | |
544 | + args = g_strdup_printf("-machine %s,accel=%s,kernel-irqchip=off " | |
545 | + "-net none -display none %s " | |
546 | + "-drive id=hd0,if=none,file=%s,format=raw " | |
547 | + "-device ide-hd,drive=hd0 ", | |
548 | + data->machine, data->accel ? data->accel : "kvm:tcg", | |
549 | + params ? params : "", disk); | |
550 | + } | |
516 | 551 | |
517 | 552 | data->qts = qtest_init(args); |
518 | 553 | |
519 | - boot_sector_test(data->qts); | |
554 | + if (use_uefi) { | |
555 | + g_assert(data->scan_len); | |
556 | + data->rsdp_addr = acpi_find_rsdp_address_uefi(data->qts, | |
557 | + data->ram_start, data->scan_len); | |
558 | + } else { | |
559 | + boot_sector_test(data->qts); | |
560 | + data->rsdp_addr = acpi_find_rsdp_address(data->qts); | |
561 | + g_assert_cmphex(data->rsdp_addr, <, 0x100000); | |
562 | + } | |
520 | 563 | |
521 | 564 | data->tables = g_array_new(false, true, sizeof(AcpiSdtTable)); |
522 | - test_acpi_rsdp_address(data); | |
523 | 565 | test_acpi_rsdp_table(data); |
524 | - test_acpi_rsdt_table(data); | |
566 | + test_acpi_rxsdt_table(data); | |
525 | 567 | test_acpi_fadt_table(data); |
526 | 568 | |
527 | 569 | if (iasl) { |
@@ -532,8 +574,15 @@ static void test_acpi_one(const char *params, test_data *data) | ||
532 | 574 | } |
533 | 575 | } |
534 | 576 | |
535 | - test_smbios_entry_point(data); | |
536 | - test_smbios_structs(data); | |
577 | + /* | |
578 | + * TODO: make SMBIOS tests work with UEFI firmware, | |
579 | + * Bug on uefi-test-tools to provide entry point: | |
580 | + * https://bugs.launchpad.net/qemu/+bug/1821884 | |
581 | + */ | |
582 | + if (!use_uefi) { | |
583 | + test_smbios_entry_point(data); | |
584 | + test_smbios_structs(data); | |
585 | + } | |
537 | 586 | |
538 | 587 | assert(!global_qtest); |
539 | 588 | qtest_quit(data->qts); |
@@ -769,13 +818,14 @@ int main(int argc, char *argv[]) | ||
769 | 818 | const char *arch = qtest_get_arch(); |
770 | 819 | int ret; |
771 | 820 | |
772 | - ret = boot_sector_init(disk); | |
773 | - if(ret) | |
774 | - return ret; | |
775 | - | |
776 | 821 | g_test_init(&argc, &argv, NULL); |
777 | 822 | |
778 | 823 | if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { |
824 | + ret = boot_sector_init(disk); | |
825 | + if (ret) { | |
826 | + return ret; | |
827 | + } | |
828 | + | |
779 | 829 | qtest_add_func("acpi/piix4", test_acpi_piix4_tcg); |
780 | 830 | qtest_add_func("acpi/piix4/bridge", test_acpi_piix4_tcg_bridge); |
781 | 831 | qtest_add_func("acpi/q35", test_acpi_q35_tcg); |
@@ -7,21 +7,12 @@ | ||
7 | 7 | # |
8 | 8 | # Authors: |
9 | 9 | # Marcel Apfelbaum <marcel.a@redhat.com> |
10 | +# Igor Mammedov <imammedo@redhat.com> | |
10 | 11 | # |
11 | 12 | # This work is licensed under the terms of the GNU GPLv2. |
12 | 13 | # See the COPYING.LIB file in the top-level directory. |
13 | 14 | |
14 | -qemu= | |
15 | - | |
16 | -if [ -e x86_64-softmmu/qemu-system-x86_64 ]; then | |
17 | - qemu="x86_64-softmmu/qemu-system-x86_64" | |
18 | -elif [ -e i386-softmmu/qemu-system-i386 ]; then | |
19 | - qemu="i386-softmmu/qemu-system-i386" | |
20 | -else | |
21 | - echo "Run 'make' to build the qemu exectutable!" | |
22 | - echo "Run this script from the build directory." | |
23 | - exit 1; | |
24 | -fi | |
15 | +qemu_bins="x86_64-softmmu/qemu-system-x86_64" | |
25 | 16 | |
26 | 17 | if [ ! -e "tests/bios-tables-test" ]; then |
27 | 18 | echo "Test: bios-tables-test is required! Run make check before this script." |
@@ -29,6 +20,14 @@ if [ ! -e "tests/bios-tables-test" ]; then | ||
29 | 20 | exit 1; |
30 | 21 | fi |
31 | 22 | |
32 | -TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test | |
23 | +for qemu in $qemu_bins; do | |
24 | + if [ ! -e $qemu ]; then | |
25 | + echo "Run 'make' to build the following QEMU executables: $qemu_bins" | |
26 | + echo "Also, run this script from the build directory." | |
27 | + exit 1; | |
28 | + fi | |
29 | + TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test | |
30 | +done | |
31 | + | |
33 | 32 | |
34 | 33 | echo "The files were rebuilt and can be added to git." |
@@ -40,14 +40,14 @@ static uint32_t acpi_find_vgia(QTestState *qts) | ||
40 | 40 | g_assert_cmphex(rsdp_offset, <, RSDP_ADDR_INVALID); |
41 | 41 | |
42 | 42 | |
43 | - acpi_parse_rsdp_table(qts, rsdp_offset, rsdp_table); | |
43 | + acpi_fetch_rsdp_table(qts, rsdp_offset, rsdp_table); | |
44 | 44 | acpi_fetch_table(qts, &rsdt, &rsdt_len, &rsdp_table[16 /* RsdtAddress */], |
45 | - "RSDT", true); | |
45 | + 4, "RSDT", true); | |
46 | 46 | |
47 | 47 | ACPI_FOREACH_RSDT_ENTRY(rsdt, rsdt_len, ent, 4 /* Entry size */) { |
48 | 48 | uint8_t *table_aml; |
49 | 49 | |
50 | - acpi_fetch_table(qts, &table_aml, &table_length, ent, NULL, true); | |
50 | + acpi_fetch_table(qts, &table_aml, &table_length, ent, 4, NULL, true); | |
51 | 51 | if (!memcmp(table_aml + 16 /* OEM Table ID */, "VMGENID", 7)) { |
52 | 52 | uint32_t vgia_val; |
53 | 53 | uint8_t *aml = &table_aml[36 /* AML byte-code start */]; |