epoll - I/O event notification facility

最近要寫一支 Network Service Daemon 的程式,本來想用 select 來寫,後來印象中有看過其他的解決方案,所以就稍微查了一下,於是就發現了今天的主角 ~ epoll。epoll 是一套在 Linux 2.6上推出的 I/O Event 機制(其實是2.5.44,但因為我只看穩定版的關係,所以直接寫2.6)。

與 select 或是 poll 機制比起來,epoll 最大的優點是它只會對 Active 的 FD 進行操作。什麼叫作只會對 Active 的 FD 進行操作?回想一下在寫 select 的程式時,總是會寫一個迴圈去掃描所有 FDSET 裏面的成員,去找出到底是哪一個 FD 有事件發生;而在 epoll 裏面就不會這樣,它只會回傳有在運作的 FD_SUBSET。實作的方式以後再研究吧,不過聽說是根據 FD 的 callback 函式有沒有被呼叫來進行判別。其他的優點像是避開了 select 的 FD_SETSIZE 限制之類的就不提了,因為感覺上這個參數是可以被改大的。

epoll 的函式只有三個:

int epoll_create ( int size );
int epoll_ctl ( int epfd, int op, int fd, struct epoll_event *event );
int epoll_wait ( int epfd, struct epoll_event *events, int maxevents, int timeout );

看名字並不難理解,所以就不加說明了,有問題就看下面的範例吧。下面是一支使用 epoll 的 TCP Echo Server 程式。Client 端就不附上了,反正改改也就是了。
#include <stdio.h>          // for printf() and fprintf() 
#include <sys/socket.h>     // for socket(), bind(), and connect() 
#include <arpa/inet.h>      // for sockaddr_in and inet_ntoa() 
#include <stdlib.h>         // for atoi() and exit() 
#include <string.h>         // for memset() 
#include <unistd.h>         // for close() 
#include <sys/time.h>       // for struct timeval {} 
#include <fcntl.h>          // for fcntl() 
#include <errno.h>          // for errno and EINTR 
#include <limits.h>         // for INT_MAX 

#include <sys/epoll.h>

#define MAXPENDING 5        // Maximum outstanding connection requests
#define MAXCLIENT  100      // Maximum client connections
#define RCVBUFSIZE 1024     // Size of receive buffer 

// Creates a TCP socket bound to the given port and puts it into the
// listening state; returns the socket descriptor.
int     CreateTCPServerSocket( unsigned short );
// Accepts one connection on the given listening socket and returns the
// descriptor of the connected client socket.
int     AcceptTCPConnection( int );
// Receives one chunk of data from the given client socket and echoes it
// back; returns the number of bytes received, 0 when the caller should
// close the connection.
int     HandleTCPClient( int );
// Looks up a descriptor in an array of the given length; returns its index,
// or -1 when it is not present.
int     IsFDSet( int, int [], int );

// Multi-port TCP echo server driven by epoll.
//
// Usage: <prog> <Timeout (secs.)> <Port 1> ...
//
// One listening socket is created for every port given on the command line.
// The listening sockets, STDIN and every accepted client socket are
// registered with a single epoll instance.  The loop then dispatches on the
// descriptor that became ready:
//   - STDIN           : shut the server down
//   - listening socket: accept one client and store it in the client pool
//   - client socket   : echo one chunk; close the client when
//                       HandleTCPClient() reports nothing more to read
// When epoll_wait() times out a "still alive" message is printed.
//
// All descriptors are registered level-triggered on purpose: each wakeup
// performs exactly one accept() or one recv(), so anything left over must be
// reported again by the next epoll_wait() call.  Edge-triggered mode would
// require non-blocking sockets that are drained until EAGAIN.
//
// Returns 0 on normal shutdown; exits with status 1 on setup failures.
int main( int argc, char *argv[] )
{
    int             *servSock;        // Listening socket descriptors, one per port 
    long            timeout;          // Timeout value given on command-line (seconds) 
    int             waitMs;           // Timeout handed to epoll_wait() (milliseconds) 
    
    int cliSock[MAXCLIENT];           // Client socket pool, -1 marks a free slot
    
    int running = 1;                 // 1 if server should be running; 0 otherwise 
    
    int noPorts;                     // Number of port specified on command-line 
    int port;                        // Index into servSock / the port arguments 
    long portArg;                    // Port number parsed from the command line 
    char *endPtr;                    // End pointer for strtol() validation 
    
    int i;                           // Index of the event being dispatched
    int slot;                        // Index into the client socket pool
    int fd;                          // Descriptor the current event refers to
    int newSock;                     // Freshly accepted client socket
    
    int                     epfd;                   // EPOLL File Descriptor. 
    struct epoll_event      ev;                     // Used for EPOLL.
    struct epoll_event      events[120];            // Used for EPOLL.
    // epoll_wait() must never be told about more room than events[] has.
    const int               maxEvents = (int) ( sizeof( events ) / sizeof( events[0] ) );
    int                     noEvents;               // EPOLL event number.

    
    // Test for correct number of arguments
    
    if ( argc < 3 )     
    {
        fprintf( stderr, "Usage:  %s <Timeout (secs.)> <Port 1> ...\n", argv[0]);
        exit(1);
    }

    // First arg: Timeout 
    timeout = strtol( argv[1], &endPtr, 10 );
    if ( endPtr == argv[1] || *endPtr != '\0' )
    {
        fprintf( stderr, "Invalid timeout: %s\n", argv[1] );
        exit(1);
    }
    
    // Convert to milliseconds without overflowing the int parameter of
    // epoll_wait(); a negative timeout means "wait indefinitely".
    if ( timeout < 0 )
    {
        waitMs = -1;
    }
    else if ( timeout > INT_MAX / 1000 )
    {
        waitMs = INT_MAX;
    }
    else
    {
        waitMs = (int) ( timeout * 1000 );
    }
    
    noPorts = argc - 2;             // Number of ports is argument count minus 2 

    // Allocate list of sockets for incoming connections 
    servSock = malloc( noPorts * sizeof( *servSock ) );
    if ( servSock == NULL )
    {
        perror( "malloc() failed" );
        exit(1);
    }
  
    // Create epoll file descriptor.
    // MAXCLIENT + noPorts = MAXCLIENT + Bind Socket.
    epfd = epoll_create( MAXCLIENT + noPorts );
    if ( epfd < 0 )
    {
        perror( "epoll_create() failed" );
        exit(1);
    }
    
    // Create list of ports and sockets to handle ports 
    for ( port = 0; port < noPorts; port++ )
    {
        // Parse the port, skip first two arguments 
        portArg = strtol( argv[port + 2], &endPtr, 10 );
        if ( endPtr == argv[port + 2] || *endPtr != '\0' || portArg < 0 || portArg > 65535 )
        {
            fprintf( stderr, "Invalid port: %s\n", argv[port + 2] );
            exit(1);
        }

        // Create port socket 
        servSock[port] = CreateTCPServerSocket( (unsigned short) portArg );
        
        // Add to the epoll.  Only readability (a pending connection) is of
        // interest on a listening socket.
        memset( &ev, 0, sizeof( ev ) );
        ev.data.fd = servSock[port];
        ev.events = EPOLLIN;
        if ( epoll_ctl( epfd, EPOLL_CTL_ADD, servSock[port], &ev ) < 0 )
        {
            perror( "epoll_ctl() failed" );
            exit(1);
        }
    }
    
    // Add STDIN into the EPOLL set.  This can fail (e.g. STDIN redirected
    // from a regular file); the server then simply cannot be stopped by
    // pressing return, so only warn.
    memset( &ev, 0, sizeof( ev ) );
    ev.data.fd = STDIN_FILENO;
    ev.events = EPOLLIN;
    if ( epoll_ctl( epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev ) < 0 )
    {
        perror( "epoll_ctl() failed for STDIN" );
    }
    
    // Initialize the client socket pool
    for( slot = 0 ; slot < MAXCLIENT ; slot++ )
    {
        cliSock[slot] = -1;
    }
    
    printf( "Starting server:  Hit return to shutdown\n" );
    while ( running )
    {
        // Wait for events.
        // A timeout of -1 makes epoll_wait() wait indefinitely.
        noEvents = epoll_wait( epfd, events, maxEvents, waitMs );
        
        if ( noEvents < 0 )
        {
            // An interrupted wait is harmless; anything else is fatal for
            // the event loop.
            if ( errno == EINTR )
            {
                continue;
            }
            perror( "epoll_wait() failed" );
            break;
        }
        
        if ( noEvents == 0 ) 
        {
            printf("No echo requests for %ld secs...Server still alive\n", timeout);
            continue; 
        }
        
        for( i = 0 ; i < noEvents; i++ )
        {
            // Errors and hang-ups are dispatched like readability so that
            // the handler observes the condition instead of the
            // level-triggered event repeating forever.
            if( !( events[i].events & ( EPOLLIN | EPOLLHUP | EPOLLERR ) ) )
            {
                continue;
            }
            
            fd = events[i].data.fd;
            
            if( STDIN_FILENO == fd )
            {
                printf("Shutting down server\n");
                getchar();
                running = 0;
                continue;
            }
            
            port = IsFDSet( fd, servSock, noPorts );
            if( port >= 0 )
            {
                // Report the real port number, not the array index.
                printf("Request on port %d:  ", atoi( argv[port + 2] ) );
                
                newSock = AcceptTCPConnection( servSock[port] );
                if( newSock < 0 )
                {
                    continue;
                }
                
                // Find a free slot using a dedicated index so the event
                // index i is left untouched.
                for( slot = 0 ; slot < MAXCLIENT ; slot++ )
                {
                    if( cliSock[slot] < 0 )
                    {
                        break;
                    }
                }
                
                if( slot == MAXCLIENT )
                {
                    // Pool exhausted: drop the connection right away rather
                    // than leaving it pending on the listening socket.
                    fprintf( stderr, "Too many clients, connection %d rejected\n", newSock );
                    close( newSock );
                    continue;
                }
                
                cliSock[slot] = newSock;
                
                // Add the client socket to the epoll fdset. 
                memset( &ev, 0, sizeof( ev ) );
                ev.data.fd = newSock;
                ev.events = EPOLLIN;
                if( epoll_ctl( epfd, EPOLL_CTL_ADD, newSock, &ev ) < 0 )
                {
                    perror( "epoll_ctl() failed" );
                    close( newSock );
                    cliSock[slot] = -1;
                }
                continue;
            }
            
            // Descriptors that are no longer in the pool (closed earlier in
            // this batch) are ignored.
            slot = IsFDSet( fd, cliSock, MAXCLIENT );
            if( slot >= 0 && HandleTCPClient( cliSock[slot] ) <= 0 )
            {
                printf( "Connection %d Shudown.\n", cliSock[slot] );
                
                // No explicit EPOLL_CTL_DEL is needed: closing the
                // descriptor removes it from the epoll set.
                close( cliSock[slot] );
                cliSock[slot] = -1;
            }
        }
    }

    // Close sockets 
    for ( port = 0; port < noPorts; port++ )
    {
        close( servSock[port] );
    }
    
    // 0 is a valid descriptor, only negative entries mark free slots.
    for ( slot = 0; slot < MAXCLIENT; slot++ )
    {
        if( cliSock[slot] >= 0 )
        {
            close( cliSock[slot] );
        }
    }
    
    // Free list of sockets 
    free( servSock );
    close( epfd );

    return 0;
}

// Opens an IPv4 TCP socket, binds it to `port` on every local interface
// (INADDR_ANY) and switches it to listening mode with a backlog of
// MAXPENDING.
//
// port: local port number in host byte order.
//
// Returns the listening socket descriptor.  Any failure of socket(), bind()
// or listen() is reported with perror() and terminates the process with
// exit status 1.
int CreateTCPServerSocket(unsigned short port)
{
    struct sockaddr_in localAddr;    // Address the socket gets bound to 
    int listenFd;                    // Descriptor handed back to the caller 

    listenFd = socket( PF_INET, SOCK_STREAM, IPPROTO_TCP );
    if ( listenFd < 0 )
    {
        perror( "socket() failed" );
        exit(1);
    }

    // Start from an all-zero address, then fill in family, interface, port 
    memset( &localAddr, 0, sizeof( localAddr ) );
    localAddr.sin_family = AF_INET;
    localAddr.sin_addr.s_addr = htonl( INADDR_ANY );
    localAddr.sin_port = htons( port );

    if ( bind( listenFd, (struct sockaddr *) &localAddr, sizeof( localAddr ) ) < 0 )
    {
        perror( "bind() failed" );
        exit(1);
    }

    if ( listen( listenFd, MAXPENDING ) < 0 )
    {
        perror( "listen() failed" );
        exit(1);
    }

    return listenFd;
}

// Accepts one connection on the listening socket `servSock` and prints the
// peer's IPv4 address together with the new descriptor.
//
// servSock: listening socket descriptor.
//
// Returns the descriptor of the connected client socket.  If accept() fails
// the error is reported with perror() and the process exits with status 1.
int AcceptTCPConnection( int servSock )
{
    struct sockaddr_in  peerAddr;                       // Filled in by accept() 
    unsigned int        peerLen = sizeof( peerAddr );   // In: buffer size, out: address size 
    int                 connFd;                         // Connected client socket 

    connFd = accept( servSock, (struct sockaddr *) &peerAddr, &peerLen );
    if ( connFd < 0 )
    {
        perror("accept() failed");
        exit(1);
    }

    printf("Handling client %s(%d)\n", inet_ntoa( peerAddr.sin_addr ), connFd );

    return connFd;
}

// Receives one chunk (at most RCVBUFSIZE bytes) from the client, prints it
// and echoes it back.
//
// clntSocket: connected client socket descriptor.
//
// Returns the number of bytes received and echoed (> 0), or 0 when the
// caller should close the connection: the peer ended the transmission, or
// recv()/send() failed (the failure is reported with perror()).  A failure
// on one client no longer terminates the whole process.
int HandleTCPClient( int clntSocket )
{
    char    echoBuffer[RCVBUFSIZE];        // Buffer for echo string 
    int     recvMsgSize;                   // Size of received message 
    int     sentTotal = 0;                 // Bytes echoed back so far 
    ssize_t sent;                          // Result of a single send() 
    
    // Receive message from client, retrying if a signal interrupts the call 
    do
    {
        recvMsgSize = (int) recv( clntSocket, echoBuffer, RCVBUFSIZE, 0 );
    } while ( recvMsgSize < 0 && errno == EINTR );
    
    if ( recvMsgSize < 0 )
    {
        perror("recv() failed");
        return 0;
    }

    if ( recvMsgSize == 0 )     // zero indicates end of transmission 
    {
        return 0;
    }

    // The buffer is not NUL-terminated (recv() may fill it completely), so
    // the print is bounded by the received length.
    printf( "Recv(%d): %.*s\n", recvMsgSize, recvMsgSize, echoBuffer );
    
    // Echo message back to client.  send() may accept fewer bytes than
    // requested, so keep going until everything is out.  MSG_NOSIGNAL turns
    // a write to a closed peer into an EPIPE error instead of a SIGPIPE
    // that would kill the process.
    while ( sentTotal < recvMsgSize )
    {
        sent = send( clntSocket, echoBuffer + sentTotal,
                     (size_t) ( recvMsgSize - sentTotal ), MSG_NOSIGNAL );
        if ( sent < 0 )
        {
            if ( errno == EINTR )
            {
                continue;
            }
            perror( "send() failed" );
            return 0;
        }
        sentTotal += (int) sent;
    }
    
    return recvMsgSize;
}

// Linear search for a descriptor in an array of descriptors.
//
// fd:    descriptor to look for.
// fdset: array to search.
// setno: number of entries of fdset to examine.
//
// Returns the index of the first entry equal to fd, or -1 when none of the
// first setno entries matches (including when setno is zero or negative).
int IsFDSet( int fd, int fdset[], int setno )
{
    int pos = 0;
    
    while ( pos < setno )
    {
        if ( fdset[pos] == fd )
        {
            return pos;
        }
        pos++;
    }
    
    return -1;
}

留言

這個網誌中的熱門文章

如何將Linux打造成OpenFlow Switch:Openvswitch

我弟家的新居感恩禮拜分享:善頌善禱

Linux Virtual Interface: TUN/TAP