/* wepoll/src/epoll.c — epoll emulation for Windows (snapshot dated 2012-09-06). */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <epoll.h>

#include "msafd.h"
#include "tree.h"
/* Number of elements in a fixed-size array (only valid on true arrays,
 * not pointers). */
#define ARRAY_COUNT(a) (sizeof(a) / (sizeof((a)[0])))

/* Completion key associated with every peer socket registered on the epoll
 * port's I/O completion port; identifies completions belonging to us. */
#define EPOLL_KEY 0xE9011
/* Forward declarations so the structs below can reference each other. */
typedef struct epoll_port_data_s epoll_port_data_t;
typedef struct epoll_op_s epoll_op_t;
typedef struct epoll_sock_data_s epoll_sock_data_t;
/* State associated with a epoll handle. */
struct epoll_port_data_s {
  /* Completion port on which all AFD poll operations complete. */
  HANDLE iocp;
  /* One lazily-created "peer" (driver) socket per known AFD provider;
   * 0 = not yet attempted, INVALID_SOCKET = creation failed earlier. */
  SOCKET peer_sockets[ARRAY_COUNT(AFD_PROVIDER_IDS)];
  /* All registered sockets, keyed by their SOCKET value. */
  RB_HEAD(epoll_sock_data_tree, epoll_sock_data_s) sock_data_tree;
  /* Head of the "attention" list: sockets for which epoll_wait must
   * (re)submit a poll operation. */
  epoll_sock_data_t* attn;
  /* Number of overlapped AFD poll operations currently in flight. */
  size_t pending_ops_count;
};
/* State associated with a socket that is registered to the epoll port.
 * (The typedef name epoll_sock_data_t is declared above; the redundant
 * `typedef` storage class that used to prefix this definition declared no
 * name and only provoked a "'typedef' ignored" compiler warning.) */
struct epoll_sock_data_s {
  SOCKET sock;              /* User-visible socket; key in the RB tree. */
  SOCKET base_sock;         /* Socket handed to AFD_POLL (currently == sock;
                             * see the TODO in epoll_ctl). */
  SOCKET peer_sock;         /* Driver socket the poll op is issued on. */
  int op_generation;        /* Generation of the latest submitted op;
                             * 0 when no op is outstanding. */
  int submitted_events;     /* Events covered by the submitted op. */
  int events;               /* Registered epoll events (always includes
                             * EPOLLERR|EPOLLHUP); -1 after EPOLL_CTL_DEL. */
  int attn;                 /* Nonzero while on the attention list. */
  uint64_t user_data;       /* epoll_event.data echoed back to the user. */
  epoll_op_t* free_op;      /* Preallocated op for the next submission. */
  epoll_sock_data_t* attn_prev;  /* Attention-list links. */
  epoll_sock_data_t* attn_next;
  RB_ENTRY(epoll_sock_data_s) tree_entry;  /* RB-tree linkage. */
};
/* State associated with a AFD_POLL request. */
struct epoll_op_s {
  /* Overlapped structure used to submit/track the async poll;
   * CONTAINING_RECORD recovers the epoll_op_t from it on completion. */
  OVERLAPPED overlapped;
  /* Request/response buffer passed to the AFD driver. */
  AFD_POLL_INFO poll_info;
  /* Snapshot of the socket's op_generation at submission time; a dequeued
   * op with a stale generation has been superseded and is discarded. */
  int generation;
  /* Back-pointer to the socket this op polls. */
  epoll_sock_data_t* sock_data;
};
/* Total order on registered sockets for the red-black tree.
 * Returns <0, 0, >0 like strcmp.
 * BUG FIX: the previous `a->sock - b->sock` is wrong because SOCKET is an
 * unsigned, pointer-sized type: the subtraction wraps and is then truncated
 * to int, which can yield an inconsistent ordering and corrupt the tree.
 * Compare explicitly instead. */
int epoll_socket_compare(epoll_sock_data_t* a, epoll_sock_data_t* b) {
  if (a->sock < b->sock)
    return -1;
  if (a->sock > b->sock)
    return 1;
  return 0;
}

RB_GENERATE_STATIC(epoll_sock_data_tree, epoll_sock_data_s, tree_entry, epoll_socket_compare)
/* Creates a new epoll port: allocates the bookkeeping structure and the
 * I/O completion port that all poll operations will complete on.
 * Returns NULL on failure (error code set via SetLastError). */
epoll_t epoll_create() {
  epoll_port_data_t* port_data;
  HANDLE iocp;

  port_data = malloc(sizeof *port_data);
  if (port_data == NULL) {
    SetLastError(ERROR_OUTOFMEMORY);
    return NULL;
  }

  iocp = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
  /* BUG FIX: CreateIoCompletionPort returns NULL on failure, not
   * INVALID_HANDLE_VALUE; the old check could never detect an error. */
  if (iocp == NULL) {
    free(port_data);
    return NULL;
  }

  port_data->iocp = iocp;
  port_data->attn = NULL;
  port_data->pending_ops_count = 0;

  memset(&port_data->peer_sockets, 0, sizeof port_data->peer_sockets);
  RB_INIT(&port_data->sock_data_tree);

  return (epoll_t) port_data;
}
/* Creates a hidden "peer" socket for the given protocol and associates it
 * with the epoll port's completion port under EPOLL_KEY.
 * Returns INVALID_SOCKET on failure. */
static SOCKET epoll__create_peer_socket(HANDLE iocp,
    WSAPROTOCOL_INFOW* protocol_info) {
  SOCKET peer;

  peer = WSASocketW(protocol_info->iAddressFamily,
                    protocol_info->iSocketType,
                    protocol_info->iProtocol,
                    protocol_info,
                    0,
                    WSA_FLAG_OVERLAPPED);
  if (peer == INVALID_SOCKET)
    return INVALID_SOCKET;

  /* Don't let the handle leak into child processes. */
  if (!SetHandleInformation((HANDLE) peer, HANDLE_FLAG_INHERIT, 0)) {
    closesocket(peer);
    return INVALID_SOCKET;
  }

  /* Route all completions for this socket to the epoll port's iocp. */
  if (CreateIoCompletionPort((HANDLE) peer, iocp, EPOLL_KEY, 0) == NULL) {
    closesocket(peer);
    return INVALID_SOCKET;
  }

  return peer;
}
/* Returns the cached peer socket for the socket's AFD provider, creating it
 * on first use. Returns INVALID_SOCKET if the protocol is not an msafd
 * protocol or if peer socket creation failed (the failure is cached so it
 * is not retried for the same provider). */
static SOCKET epoll__get_peer_socket(epoll_port_data_t* port_data,
    WSAPROTOCOL_INFOW* protocol_info) {
  SOCKET peer_socket;
  int found = -1;
  size_t idx;

  /* Look the provider GUID up in the table of known AFD providers. */
  for (idx = 0; idx < ARRAY_COUNT(AFD_PROVIDER_IDS); idx++) {
    if (memcmp((void*) &protocol_info->ProviderId,
               (void*) &AFD_PROVIDER_IDS[idx],
               sizeof protocol_info->ProviderId) == 0) {
      found = (int) idx;
    }
  }

  /* Check if the protocol uses an msafd socket. */
  if (found < 0) {
    SetLastError(ERROR_NOT_SUPPORTED);
    return INVALID_SOCKET;
  }

  /* If we didn't (try to) create a peer socket yet, make one now. A cached
   * INVALID_SOCKET means an earlier attempt failed; don't try again. */
  peer_socket = port_data->peer_sockets[found];
  if (peer_socket == 0) {
    peer_socket = epoll__create_peer_socket(port_data->iocp, protocol_info);
    port_data->peer_sockets[found] = peer_socket;
  }

  return peer_socket;
}
/* Submits an overlapped AFD_POLL operation for the socket, translating its
 * registered epoll events into AFD poll events. Consumes sock_data->free_op
 * on success. Returns 0 on success, -1 on failure (no op was started). */
int epoll__submit_poll_op(epoll_port_data_t* port_data,
    epoll_sock_data_t* sock_data) {
  epoll_op_t* op;
  int events;
  DWORD result, afd_events;

  op = sock_data->free_op;
  events = sock_data->events;

  /* epoll_ctl should ensure that there is a free op struct. */
  assert(op != NULL);

  /* These events should always be registered. */
  assert(events & EPOLLERR);
  assert(events & EPOLLHUP);
  afd_events = AFD_POLL_ABORT | AFD_POLL_CONNECT_FAIL | AFD_POLL_LOCAL_CLOSE;

  if (events & (EPOLLIN | EPOLLRDNORM))
    afd_events |= AFD_POLL_RECEIVE | AFD_POLL_ACCEPT;
  if (events & (EPOLLIN | EPOLLRDBAND))
    afd_events |= AFD_POLL_RECEIVE_EXPEDITED;
  /* BUG FIX: this mask previously tested EPOLLRDBAND (a read event) instead
   * of EPOLLWRBAND; write interest is signaled by EPOLLOUT/EPOLLWR* bits,
   * matching the reverse mapping in epoll_wait. */
  if (events & (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND))
    afd_events |= AFD_POLL_SEND | AFD_POLL_CONNECT;

  /* Tag the op so stale completions can be recognized and discarded. */
  op->generation = ++sock_data->op_generation;
  op->sock_data = sock_data;

  memset(&op->overlapped, 0, sizeof op->overlapped);

  /* Exclusive: submitting a new poll op cancels the previous one. */
  op->poll_info.Exclusive = TRUE;
  op->poll_info.NumberOfHandles = 1;
  op->poll_info.Timeout.QuadPart = INT64_MAX;
  op->poll_info.Handles[0].Handle = (HANDLE) sock_data->base_sock;
  op->poll_info.Handles[0].Status = 0;
  op->poll_info.Handles[0].Events = afd_events;

  result = afd_poll(sock_data->peer_sock, &op->poll_info, &op->overlapped);
  if (result != 0) {
    DWORD error = WSAGetLastError();
    if (error != WSA_IO_PENDING) {
      /* A real error happened and no overlapped operation was started. */
      return -1;
    }
  }

  sock_data->free_op = NULL;
  port_data->pending_ops_count++;

  return 0;
}
/* Adds, modifies, or removes a socket registration on the epoll port.
 * op is one of EPOLL_CTL_ADD / EPOLL_CTL_MOD / EPOLL_CTL_DEL.
 * Returns 0 on success, -1 on failure (SetLastError/WSASetLastError set). */
int epoll_ctl(epoll_t port_handle, int op, SOCKET sock,
    struct epoll_event* event) {
  epoll_port_data_t* port_data;

  port_data = (epoll_port_data_t*) port_handle;

  switch (op) {
    case EPOLL_CTL_ADD: {
      epoll_sock_data_t* sock_data;
      epoll_op_t* poll_op;  /* Renamed: the old local `op` shadowed the
                             * `op` parameter. */
      SOCKET peer_sock;
      WSAPROTOCOL_INFOW protocol_info;
      int len;

      /* Obtain protocol information about the socket. */
      len = sizeof protocol_info;
      if (getsockopt(sock,
                     SOL_SOCKET,
                     SO_PROTOCOL_INFOW,
                     (char*) &protocol_info,
                     &len) != 0) {
        return -1;
      }

      peer_sock = epoll__get_peer_socket(port_data, &protocol_info);
      if (peer_sock == INVALID_SOCKET)
        return -1;

      sock_data = malloc(sizeof *sock_data);
      if (sock_data == NULL) {
        SetLastError(ERROR_OUTOFMEMORY);
        return -1;
      }

      poll_op = malloc(sizeof *poll_op);
      if (poll_op == NULL) {
        SetLastError(ERROR_OUTOFMEMORY);
        free(sock_data);
        return -1;
      }

      sock_data->sock = sock;
      /* TODO: actually get base socket. */
      sock_data->base_sock = sock;
      sock_data->op_generation = 0;
      sock_data->submitted_events = 0;
      /* EPOLLERR and EPOLLHUP are always reported, as on Linux. */
      sock_data->events = event->events | EPOLLERR | EPOLLHUP;
      sock_data->user_data = event->data.u64;
      sock_data->peer_sock = peer_sock;
      sock_data->free_op = poll_op;

      if (RB_INSERT(epoll_sock_data_tree,
                    &port_data->sock_data_tree,
                    sock_data) != NULL) {
        /* Socket was already added. */
        free(sock_data);
        free(poll_op);
        SetLastError(ERROR_ALREADY_EXISTS);
        return -1;
      }

      /* Add to the attention list so epoll_wait submits a poll op. */
      sock_data->attn = 1;
      sock_data->attn_prev = NULL;
      sock_data->attn_next = port_data->attn;
      if (port_data->attn)
        port_data->attn->attn_prev = sock_data;
      port_data->attn = sock_data;

      return 0;
    }

    case EPOLL_CTL_MOD: {
      epoll_sock_data_t lookup;
      epoll_sock_data_t* sock_data;

      lookup.sock = sock;
      sock_data = RB_FIND(epoll_sock_data_tree,
                          &port_data->sock_data_tree,
                          &lookup);
      if (sock_data == NULL) {
        /* Socket has not been registered with epoll instance. */
        SetLastError(ERROR_NOT_FOUND);
        return -1;
      }

      if (event->events & ~sock_data->submitted_events) {
        if (sock_data->free_op == NULL) {
          epoll_op_t* poll_op = malloc(sizeof *poll_op);
          if (poll_op == NULL) {
            SetLastError(ERROR_OUTOFMEMORY);
            return -1;
          }
          /* BUG FIX: the freshly allocated op used to be discarded here
           * (free_op was assigned NULL), leaking it and leaving the socket
           * without a free op, which trips the assert in
           * epoll__submit_poll_op. Store the op instead. */
          sock_data->free_op = poll_op;
        }

        /* Add to attention list, if not already added. */
        if (!sock_data->attn) {
          sock_data->attn_prev = NULL;
          sock_data->attn_next = port_data->attn;
          if (port_data->attn)
            port_data->attn->attn_prev = sock_data;
          port_data->attn = sock_data;
          sock_data->attn = 1;
        }
      }

      sock_data->events = event->events | EPOLLERR | EPOLLHUP;
      sock_data->user_data = event->data.u64;

      return 0;
    }

    case EPOLL_CTL_DEL: {
      epoll_sock_data_t lookup;
      epoll_sock_data_t* sock_data;

      lookup.sock = sock;
      sock_data = RB_FIND(epoll_sock_data_tree,
                          &port_data->sock_data_tree,
                          &lookup);
      if (sock_data == NULL) {
        /* Socket has not been registered with epoll instance. */
        SetLastError(ERROR_NOT_FOUND);
        return -1;
      }

      RB_REMOVE(epoll_sock_data_tree, &port_data->sock_data_tree, sock_data);

      free(sock_data->free_op);
      sock_data->free_op = NULL;  /* Don't leave a dangling pointer. */
      sock_data->events = -1;     /* Marks the socket as deleted for any op
                                   * still in flight (see epoll_wait). */

      /* Remove from attention list. */
      if (sock_data->attn) {
        if (sock_data->attn_prev != NULL)
          sock_data->attn_prev->attn_next = sock_data->attn_next;
        if (sock_data->attn_next != NULL)
          sock_data->attn_next->attn_prev = sock_data->attn_prev;
        if (port_data->attn == sock_data)
          port_data->attn = sock_data->attn_next;
        sock_data->attn = 0;
        sock_data->attn_prev = sock_data->attn_next = NULL;
      }

      if (sock_data->submitted_events == 0) {
        assert(sock_data->op_generation == 0);
        free(sock_data);
      } else {
        /* There are still one or more ops pending; epoll_wait frees
         * sock_data when the last one returns. */
        assert(sock_data->op_generation > 0);
      }

      return 0;
    }

    default:
      WSASetLastError(WSAEINVAL);
      return -1;
  }
}
/* Waits for events on the epoll port. Submits pending poll ops for sockets
 * on the attention list, then dequeues completion packets until at least one
 * relevant event is found or the timeout expires.
 * timeout: milliseconds; 0 = poll, negative = wait forever.
 * Returns the number of events stored in `events` (up to maxevents),
 * 0 on timeout, or -1 on failure. */
int epoll_wait(epoll_t port_handle, struct epoll_event* events, int maxevents,
    int timeout) {
  epoll_port_data_t* port_data;
  DWORD due = 0;
  DWORD gqcs_timeout;

  port_data = (epoll_port_data_t*) port_handle;

  /* Create overlapped poll operations for all sockets on the attention
   * list, draining the list as we go. */
  while (port_data->attn != NULL) {
    epoll_sock_data_t* sock_data = port_data->attn;
    assert(sock_data->attn);

    /* Check if we need to submit another req. */
    if (sock_data->events & EPOLL_EVENT_MASK & ~sock_data->submitted_events) {
      int r = epoll__submit_poll_op(port_data, sock_data);
      /* TODO: handle error. */
      (void) r;
    }

    /* Remove from attention list. */
    port_data->attn = sock_data->attn_next;
    sock_data->attn_prev = sock_data->attn_next = NULL;
    sock_data->attn = 0;
  }

  /* Compute the timeout for GetQueuedCompletionStatus, and the wait end
   * time, if the user specified a timeout other than zero or infinite. */
  if (timeout > 0) {
    due = GetTickCount() + timeout;
    gqcs_timeout = (DWORD) timeout;
  } else if (timeout == 0) {
    gqcs_timeout = 0;
  } else {
    gqcs_timeout = INFINITE;
  }

  /* Dequeue completion packets until either at least one interesting event
   * has been discovered, or the timeout is reached. */
  do {
    DWORD result, max_entries;
    ULONG count, i;
    OVERLAPPED_ENTRY entries[64];
    int num_events = 0;

    /* Compute how many overlapped entries can be dequeued at most. */
    max_entries = ARRAY_COUNT(entries);
    if ((int) max_entries > maxevents)
      max_entries = maxevents;

    result = GetQueuedCompletionStatusEx(port_data->iocp,
                                         entries,
                                         max_entries,
                                         &count,
                                         gqcs_timeout,
                                         FALSE);

    if (!result) {
      DWORD error = GetLastError();
      if (error == WAIT_TIMEOUT) {
        return 0;
      } else {
        return -1;
      }
    }

    port_data->pending_ops_count -= count;

    /* Successfully dequeued overlappeds. */
    for (i = 0; i < count; i++) {
      OVERLAPPED* overlapped;
      epoll_op_t* op;
      epoll_sock_data_t* sock_data;
      DWORD afd_events;
      int registered_events, reported_events;

      overlapped = entries[i].lpOverlapped;
      op = CONTAINING_RECORD(overlapped, epoll_op_t, overlapped);
      sock_data = op->sock_data;

      if (op->generation < sock_data->op_generation) {
        /* This op has been superseded. Free and ignore it. */
        free(op);
        continue;
      }

      /* Dequeued the most recent op. Reset generation and
       * submitted_events, and recycle the op struct. */
      sock_data->op_generation = 0;
      sock_data->submitted_events = 0;
      sock_data->free_op = op;

      registered_events = sock_data->events;
      reported_events = 0;

      /* Check if this op was associated with a socket that was removed
       * with EPOLL_CTL_DEL. */
      if (registered_events == -1) {
        free(op);
        free(sock_data);
        continue;
      }

      /* Check for error. */
      if (!NT_SUCCESS(overlapped->Internal)) {
        struct epoll_event* ev = events + (num_events++);
        ev->data.u64 = sock_data->user_data;
        ev->events = EPOLLERR;
        continue;
      }

      if (op->poll_info.NumberOfHandles == 0) {
        /* NumberOfHandles can be zero if this poll operation was canceled
         * due to a more recent exclusive poll operation. */
        afd_events = 0;
      } else {
        afd_events = op->poll_info.Handles[0].Events;
      }

      /* Check for a closed socket. */
      if (afd_events & AFD_POLL_LOCAL_CLOSE) {
        free(op);
        free(sock_data);
        continue;
      }

      /* Convert afd events to epoll events. */
      if (afd_events & (AFD_POLL_RECEIVE | AFD_POLL_ACCEPT))
        reported_events |= (EPOLLIN | EPOLLRDNORM);
      if (afd_events & AFD_POLL_RECEIVE_EXPEDITED)
        reported_events |= (EPOLLIN | EPOLLRDBAND);
      if (afd_events & AFD_POLL_SEND)
        reported_events |= (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
      if ((afd_events & AFD_POLL_DISCONNECT) && !(afd_events & AFD_POLL_ABORT))
        reported_events |= (EPOLLRDHUP | EPOLLIN | EPOLLRDNORM | EPOLLRDBAND);
      if (afd_events & AFD_POLL_ABORT)
        reported_events |= EPOLLHUP | EPOLLERR;
      if (afd_events & AFD_POLL_CONNECT)
        reported_events |= (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
      if (afd_events & AFD_POLL_CONNECT_FAIL)
        reported_events |= EPOLLERR;

      /* Don't report events that the user didn't specify. */
      reported_events &= registered_events;

      /* Unless EPOLLONESHOT is used or no events were reported that the
       * user is interested in, add the socket back to the attention list.
       * BUG FIX: the old condition `!registered_events & EPOLLONESHOT`
       * parsed as `(!registered_events) & EPOLLONESHOT`; parenthesize the
       * intended test. */
      if (!(registered_events & EPOLLONESHOT) || reported_events == 0) {
        assert(!sock_data->attn);
        /* BUG FIX: pushing onto the doubly-linked list used to set the old
         * head's attn_next (corrupting the list) and never set the attn
         * flag, tripping the assert at the top of this function on the
         * next call. Link and flag correctly. */
        sock_data->attn_prev = NULL;
        sock_data->attn_next = port_data->attn;
        if (port_data->attn != NULL)
          port_data->attn->attn_prev = sock_data;
        port_data->attn = sock_data;
        sock_data->attn = 1;
      }

      if (reported_events) {
        struct epoll_event* ev = events + (num_events++);
        ev->data.u64 = sock_data->user_data;
        ev->events = reported_events;
      }
    }

    if (num_events > 0)
      return num_events;

    /* Events were dequeued, but none were relevant. Recompute timeout. */
    if (timeout > 0) {
      DWORD now = GetTickCount();
      /* BUG FIX: guard against unsigned wraparound when the deadline has
       * already passed; `due - now` would otherwise become a huge timeout. */
      if ((LONG) (due - now) <= 0)
        return 0;
      gqcs_timeout = due - now;
    }
  } while (timeout > 0);

  return 0;
}
/* Closes the epoll port: closes all peer sockets (which forces every
 * pending poll op to complete), drains and frees those completions, frees
 * all registered socket state, closes the completion port, and frees the
 * port itself. Returns 0 on success, -1 on failure. */
int epoll_close(epoll_t port_handle) {
  epoll_port_data_t* port_data;
  epoll_sock_data_t* sock_data;
  size_t i;

  port_data = (epoll_port_data_t*) port_handle;

  /* Close all peer sockets. This will make all pending ops return. */
  for (i = 0; i < ARRAY_COUNT(port_data->peer_sockets); i++) {
    SOCKET peer_sock = port_data->peer_sockets[i];
    if (peer_sock != 0 && peer_sock != INVALID_SOCKET) {
      if (closesocket(peer_sock) != 0)
        return -1;
      port_data->peer_sockets[i] = 0;
    }
  }

  /* There is no list of epoll_ops to free. And even if there was, just
   * freeing them would be dangerous since the kernel might still alter
   * the overlapped status contained in them. But since we are sure that
   * all ops will soon return, just await them all. */
  while (port_data->pending_ops_count > 0) {
    OVERLAPPED_ENTRY entries[64];
    DWORD result;
    ULONG count, j;

    result = GetQueuedCompletionStatusEx(port_data->iocp,
                                         entries,
                                         ARRAY_COUNT(entries),
                                         &count,
                                         INFINITE,
                                         FALSE);

    if (!result)
      return -1;

    port_data->pending_ops_count -= count;

    for (j = 0; j < count; j++) {
      epoll_op_t* op = CONTAINING_RECORD(entries[j].lpOverlapped,
                                         epoll_op_t,
                                         overlapped);
      free(op);
    }
  }

  /* Remove all entries from the socket_state tree. */
  while ((sock_data = RB_ROOT(&port_data->sock_data_tree)) != NULL) {
    RB_REMOVE(epoll_sock_data_tree, &port_data->sock_data_tree, sock_data);
    free(sock_data->free_op);  /* free(NULL) is a no-op. */
    free(sock_data);
  }

  /* Close the I/O completion port. */
  CloseHandle(port_data->iocp);

  /* Finally, free the port data. */
  free(port_data);

  return 0;
}