/*
**  NGPT - Next Generation POSIX Threading
**  Copyright (c) 2001 IBM Corporation
**  Portions Copyright (c) 1999-2000 Ralf S. Engelschall
**
**  This file is part of NGPT, a non-preemptive thread scheduling
**  library which can be found at http://www.ibm.com/developer.
**
**  This library is free software; you can redistribute it and/or
**  modify it under the terms of the GNU Lesser General Public
**  License as published by the Free Software Foundation; either
**  version 2.1 of the License, or (at your option) any later version.
**
**  This library is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
**  Lesser General Public License for more details.
**
**  You should have received a copy of the GNU Lesser General Public
**  License along with this library; if not, write to the Free Software
**  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
**  USA.
**
**  pth_native.c: Pth native thread handling.
*/
                             /* ``If you can't do it in ANSI C,
                                  it isn't worth doing.''
                                                  -- Unknown */

#include "pth_p.h"
#include <sched.h>      /* clone(2), sched_yield(2) */
#include <sys/mman.h>   /* mmap(2), munmap(2) */

#ifndef CLONE_PARENT
#define CLONE_PARENT 0x00008000  /* Same parent? */
#endif
#ifndef CLONE_THREAD
#define CLONE_THREAD 0x00010000  /* Same thread group? */
#endif

#if cpp

/* Watchdog timer interval (secs), determines how often the watchdog wakes up. */
#ifndef WATCHDOG_TIMER_INTERVAL
#define WATCHDOG_TIMER_INTERVAL 2
#endif

/* After MAX_SPIN_COUNT iterations, we put the calling thread to sleep. */
#ifndef MAX_SPIN_COUNT
#define MAX_SPIN_COUNT 50
#endif

/*
 * Duration of sleep (in nanoseconds) when we can't acquire a spinlock
 * after MAX_SPIN_COUNT iterations.
 */
#ifndef SPIN_SLEEP_DURATION
#define SPIN_SLEEP_DURATION 2000001
#endif

#if PTH_NEED_SEPARATE_REGISTER_STACK > 0
extern int __clone2(int (*fn)(void *arg), void *thread_bottom,
                    size_t stack_size, int flags, void *arg);
#endif

struct pth_descr_st {
    int          is_used;          /* 1 if descr is used, 0 if not             */
    pid_t        pid;              /* pid of native thread                     */
    pid_t        tid;              /* tid of native thread                     */
    size_t       stacksize;        /* stack size                               */
    char        *true_stack;       /* the "true" un-twiddled stack pointer     */
    char        *stack;            /* the stack passed to clone                */
    pth_t        sched;            /* scheduler for this thread                */
    pth_t        current;          /* the current thread on this native        */
    pth_t        nexttimer_thread; /* the timer thread this task is waiting on */
    int          sched_index;      /* the index of this descriptor in table    */
    int          ptrfixed;         /* 1 if ptr was adjusted, 0 otherwise       */
    int          is_bound;         /* 1 if thread is bound, 0 otherwise        */
    int          sigpipe[2];       /* internal signal occurrence pipe          */
    sigset_t     sigpending;       /* mask of pending signals                  */
    sigset_t     sigblock;         /* mask of signals we block in scheduler    */
    sigset_t     sigcatch;         /* mask of signals we have to catch         */
    sigset_t     sigraised;        /* mask of raised signals                   */
#if PTH_MCTX_MTH(sjlj)     &&\
    !PTH_MCTX_DSP(sjljlx)  &&\
    !PTH_MCTX_DSP(sjljisc) &&\
    !PTH_MCTX_DSP(sjljw32)
    jmp_buf     *mctx_trampoline;  /* trampoline context                       */
    pth_mctx_t   mctx_caller;      /* trampoline caller                        */
    sig_atomic_t mctx_called;      /* whether the trampoline has been called   */
    pth_mctx_t  *mctx_creating;    /* the context of the creator               */
    void       (*mctx_creating_func)(void); /* function to call after creation */
    sigset_t     mctx_creating_sigs; /* the signals used during creation       */
#endif
    char        *stack_top;        /* the top of the stack (stack + stacksize) */
};

#endif /* cpp */

intern int pth_watchdog_enabled = FALSE;
static struct pth_descr_st pth_watchdog_descr;
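/*
 * Lifecycle sketch (an illustrative summary only; every name below is
 * defined elsewhere in this file, no new API is implied):
 *
 *   pth_descr_t d = pth_alloc_native(TRUE, FALSE);    -- claim slot, mmap stack
 *   clone(pth_new_scheduler, d->stack_top, flags, d); -- start the native
 *   ... pth_new_scheduler() records d->pid/d->tid and spawns d->sched ...
 *   pth_cleanup_native(d->sched_index);               -- kill, unmap, free slot
 */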
#ifdef PTH_HAVE_KERNEL_PATCH
intern pid_t gettid(void);
inline _syscall0(pid_t, gettid);

intern int tkill(pid_t pid, int sig);
inline int tkill(pid_t pid, int sig)
{
    return syscall(__NR_tkill, pid, sig);
}
#endif

intern pth_descr_t pth_alloc_native(int create_stack, int is_watchdog)
{
    pth_descr_t descr = (is_watchdog) ? &(pth_watchdog_descr)
                                      : &(pth_native_list[pth_number_of_natives++]);
    char *stack = NULL;
    size_t pagesize = getpagesize();
    size_t stack_granularity;

    /*
     * Since part of this structure is configurable, we're going to
     * initialize the whole thing to 0x0 to ensure no problems later on...
     */
    memset(descr, 0x0, sizeof(struct pth_descr_st));

    descr->is_used = TRUE;
    descr->sched_index = pth_number_of_natives - 1;

#if PTH_MCTX_MTH(sjlj)     &&\
    !PTH_MCTX_DSP(sjljlx)  &&\
    !PTH_MCTX_DSP(sjljisc) &&\
    !PTH_MCTX_DSP(sjljw32)
    if (!is_watchdog) {
        descr->mctx_trampoline = mmap(NULL, sizeof(jmp_buf),
                                      PROT_READ | PROT_WRITE | PROT_EXEC,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (descr->mctx_trampoline == MAP_FAILED) {
            pth_number_of_natives--;
            descr->is_used = FALSE;
            return_errno(NULL, ENOMEM);
        }
    }
#endif

    /* If we're not creating a stack, just return now... */
    if (!create_stack)
        return descr;

    /* Create the native stack... */
#if PTH_NEED_SEPARATE_REGISTER_STACK > 0
    stack_granularity = 2*pagesize;
#else
    stack_granularity = pagesize;
#endif

    /* Stack size is 64 * the stack granularity, plus 8 bytes to allow alignment... */
    descr->stacksize = (64*stack_granularity) + 8;

    /* Allocate the stack... */
    stack = mmap(NULL, descr->stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (stack == MAP_FAILED) {
        if (!is_watchdog)
            pth_number_of_natives--;
        descr->is_used = FALSE;
        return_errno(NULL, ENOMEM);
    }

    /* Save the unmodified ptr before we align it... free will use this later... */
    descr->true_stack = stack;

    /* Align it if necessary... */
    if ((long)stack % 16L != 0L)
        stack = ((char *)(stack + 8));

    /* Save the possibly modified stack ptr... */
    descr->stack = stack;

    /* Calculate the stack top... */
    descr->stack_top = ((char *)(stack + descr->stacksize));

    return descr;
}

intern pid_t pth_new_native(void)
{
    int flags = CLONE_VM | CLONE_FS | CLONE_SIGHAND | CLONE_FILES | CLONE_THREAD;
    pid_t native_pid;
    pth_descr_t descr = NULL;

    pth_acquire_lock(&pth_native_lock);
    if ((descr = pth_alloc_native(TRUE, FALSE)) == NULL) {
        pth_release_lock(&pth_native_lock);
        return -1;
    }
    pth_release_lock(&pth_native_lock);

    /* Ready to clone... */
#if PTH_NEED_SEPARATE_REGISTER_STACK > 0
    /*
     * __clone2() takes slightly different parameters than clone(): the
     * stack *bottom* rather than the stack top, plus the stack size.
     * This is actually easier to understand from a programming standpoint,
     * but it is not available on all versions of the Linux kernel.
     */
    native_pid = __clone2(pth_new_scheduler, descr->stack, descr->stacksize,
                          flags, descr);
#else
    /*
     * Note: clone() takes the *top* of the stack as its parameter.
     */
    native_pid = clone(pth_new_scheduler, descr->stack_top, flags, descr);
#endif

    if (native_pid == -1) {
        pth_number_of_natives--;
#if PTH_MCTX_MTH(sjlj)     &&\
    !PTH_MCTX_DSP(sjljlx)  &&\
    !PTH_MCTX_DSP(sjljisc) &&\
    !PTH_MCTX_DSP(sjljw32)
        munmap(descr->mctx_trampoline, sizeof(jmp_buf));
#endif
        munmap(descr->true_stack, descr->stacksize);
        descr->is_used = FALSE;
    }

    /* Return the native pid... */
    return native_pid;
}
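/*
 * Stack-direction sketch for the two clone flavors above (illustrative;
 * the ia64 detail is background knowledge, not something this file spells
 * out): on architectures where the stack grows downward, clone() is handed
 * the highest usable address, while ia64 also grows a register backing
 * store upward from the same mapping, which is presumably why __clone2()
 * wants the base address and a size instead.
 *
 *   clone(fn, stack + stacksize, flags, arg);    -- pass the top (high end)
 *   __clone2(fn, stack, stacksize, flags, arg);  -- pass the bottom + size
 */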
intern pid_t pth_new_watchdog(void)
{
    int flags = CLONE_VM | CLONE_FS | CLONE_SIGHAND | CLONE_FILES | CLONE_THREAD;
    pid_t native_pid;
    pth_descr_t descr = NULL;

    if ((descr = pth_alloc_native(TRUE, TRUE)) == NULL)
        return -1;

    pth_watchdog_enabled = TRUE;

    /* Ready to clone... */
#if PTH_NEED_SEPARATE_REGISTER_STACK > 0
    /*
     * __clone2() takes slightly different parameters than clone(): the
     * stack *bottom* rather than the stack top, plus the stack size
     * (see pth_new_native() above).
     */
    native_pid = __clone2(pth_watchdog, descr->stack, descr->stacksize,
                          flags, descr);
#else
    /*
     * Note: clone() takes the *top* of the stack as its parameter.
     */
    native_pid = clone(pth_watchdog, descr->stack_top, flags, descr);
#endif

    if (native_pid == -1) {
        munmap(descr->true_stack, descr->stacksize);
        descr->is_used = FALSE;
    }

    /* Return the native pid... */
    return native_pid;
}

intern void pth_drop_natives(void)
{
    int slot = 0;

    /* Signal the watchdog to terminate... */
    pth_watchdog_enabled = FALSE;

    /*
     * Iterate over each used slot in the native list,
     * calling cleanup on the native thread...
     */
    do {
        pth_cleanup_native(slot);
    } while (pth_native_list[++slot].is_used == TRUE);

    /* Finish watchdog termination... */
    if (pth_watchdog_descr.is_used) {
        tkill(pth_watchdog_descr.tid, SIGKILL);
        munmap(pth_watchdog_descr.true_stack, pth_watchdog_descr.stacksize);
        pth_watchdog_descr.is_used = FALSE;
    }
}

intern void pth_cleanup_native(int slot)
{
    /*
     * Here we clean up the native thread, killing it and
     * freeing the scheduler and the encompassing descriptor.
     * We also need to mark the entry in the list as unused...
     *
     * Note: We don't free the stack for the native; this
     * is done for us when we issue the kill().
     */
    if (pth_native_list[slot].is_used) {
        pth_debug3("pth_cleanup_native: cleaning up pid %d, tid %d",
                   pth_native_list[slot].pid, pth_native_list[slot].tid);

        /* Clean up the scheduler */
        pth_tcb_free(pth_native_list[slot].sched);

        /* Now close the signal pipe... */
        close(pth_native_list[slot].sigpipe[0]);
        close(pth_native_list[slot].sigpipe[1]);

        /*
         * Free the clone stack and
         * clean out the table entry...
         */
        if (slot != 0) {
#if PTH_MCTX_MTH(sjlj)     &&\
    !PTH_MCTX_DSP(sjljlx)  &&\
    !PTH_MCTX_DSP(sjljisc) &&\
    !PTH_MCTX_DSP(sjljw32)
            munmap(pth_native_list[slot].mctx_trampoline, sizeof(jmp_buf));
#endif
            munmap(pth_native_list[slot].true_stack,
                   pth_native_list[slot].stacksize);
            pth_native_list[slot].is_used = FALSE;
        }

        /* Finally, kill the native... if it isn't slot 0 ;-) */
        if (slot != 0 && pth_native_list[slot].tid != 0)
            tkill(pth_native_list[slot].tid, SIGKILL);
    }
    return;
}

intern void pth_dumpnatives(FILE *fp)
{
    int n = 0;
    int i = 1;

    fprintf(fp, "| Native Thread Queue:\n");
    for (n = 0; n < PTH_MAX_SCHEDULERS; n++) {
        if (pth_native_list[n].is_used == FALSE)
            break;
        fprintf(fp, "|   %d. native thread 0x%lx pid = %d, tid = %d\n",
                i++, (unsigned long)&pth_native_list[n],
                pth_native_list[n].pid, pth_native_list[n].tid);
    }
    if (pth_watchdog_descr.is_used)
        fprintf(fp, "|   %d. native thread 0x%lx pid = %d, tid = %d (WATCHDOG)\n",
                i++, (unsigned long)&pth_watchdog_descr,
                pth_watchdog_descr.pid, pth_watchdog_descr.tid);
    return;
}

intern int pth_native_yield(void)
{
    return sched_yield();
}

intern int pth_watchdog(void *arg)
{
    int i;
    char c = 1;
    struct timespec tm;
    pth_descr_t descr = NULL;

    pth_debug1("pth_watchdog: starting new watchdog");

    /* set the pid/tid of this watchdog... */
    descr = (pth_descr_t)arg;
    descr->pid = getpid();
    descr->tid = gettid();
    descr->is_bound = 1;

    while (pth_watchdog_enabled == TRUE) {
        tm.tv_sec = WATCHDOG_TIMER_INTERVAL;
        tm.tv_nsec = 0;
        nanosleep(&tm, NULL);

        pth_debug2("pth_watchdog: awake, tid = %d", descr->tid);
        if (pth_active_threads() > 1) {
            pth_debug1("pth_watchdog: awake, work to be done");
            for (i = 0; pth_native_list[i].is_used; i++) {
                if (pth_native_list[i].is_bound == FALSE)
                    pth_sc(write)(pth_native_list[i].sigpipe[1], &c, sizeof(char));
            }
        }
    }

    pth_debug1("pth_watchdog: exiting");
    return 0;
}
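/*
 * Wakeup-path sketch: the byte written to sigpipe[1] above is assumed to
 * rouse a scheduler blocked on the read end of its pipe in an event loop
 * elsewhere in the library (the reader side is not in this file), roughly:
 *
 *   FD_SET(descr->sigpipe[0], &rfds);
 *   select(maxfd + 1, &rfds, NULL, NULL, &tv);  -- returns when the watchdog
 *                                                  writes its wakeup byte
 *
 * so every WATCHDOG_TIMER_INTERVAL seconds each unbound native gets nudged
 * to re-run its scheduler while more than one user thread is active.
 */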
intern int pth_new_scheduler(void *arg)
{
    pth_attr_t t_attr;
    pth_descr_t descr = NULL;

    pth_debug1("pth_new_scheduler: starting new scheduler");

    /* set the pid/tid of this scheduler... */
    descr = (pth_descr_t)arg;
    pth_acquire_lock(&pth_native_lock);
    descr->pid = getpid();
    descr->tid = gettid();
    descr->is_bound = 1;
    pth_release_lock(&pth_native_lock);

    /* create the internal signal pipe for this scheduler... */
    if (pipe(descr->sigpipe) == -1) {
        fprintf(stderr, "**Pth** INIT: Cannot create internal pipe: %s\n",
                strerror(errno));
        abort();
    }
    pth_fdmode(descr->sigpipe[0], PTH_FDMODE_NONBLOCK);
    pth_fdmode(descr->sigpipe[1], PTH_FDMODE_NONBLOCK);

    /* spawn the scheduler thread that will schedule on the new native thread */
    t_attr = pth_attr_new();
    if (t_attr == NULL)
        return FALSE;
    pth_attr_set(t_attr, PTH_ATTR_PRIO,         PTH_PRIO_MAX);
    pth_attr_set(t_attr, PTH_ATTR_NAME,         "**SCHEDULER**");
    pth_attr_set(t_attr, PTH_ATTR_JOINABLE,     FALSE);
    pth_attr_set(t_attr, PTH_ATTR_CANCEL_STATE, PTH_CANCEL_DISABLE);
    pth_attr_set(t_attr, PTH_ATTR_STACK_SIZE,   64*1024);
    pth_attr_set(t_attr, PTH_ATTR_STACK_ADDR,   NULL);
    descr->sched = pth_spawn(t_attr, pth_scheduler, NULL);
    if (descr->sched == NULL) {
        errno_shield { pth_attr_destroy(t_attr); }
        return FALSE;
    }
    descr->sched->lastrannative = descr->tid;

    pth_debug2("pth_new_scheduler: scheduler started, tid = %i. Transferring control to it...",
               descr->sched->lastrannative);

    /* Make the scheduler the current thread for this native... */
    pth_set_current(descr->sched);

    /* switch context to this new scheduler... */
    pth_mctx_restore(&descr->sched->mctx);

    pth_debug2("pth_new_scheduler: came back from switch to scheduler!!!! Errno = %i",
               errno);
    /*NOTREACHED*/
    return 0;
}

intern inline int pth_testandset(int *spinlock)
{
    int ret;

#ifndef __s390__
#if PTH_NEED_SEPARATE_REGISTER_STACK > 0
    /* IA-64: atomically exchange 1 into *spinlock; the old value lands in ret */
    __asm__ __volatile__(
        "xchg4 %0=%1, %2"
        : "=r"(ret), "=m"(*spinlock)
        : "0"(1), "m"(*spinlock)
        : "memory");
#else
    /* IA-32: atomically exchange 1 into *spinlock; xchg with a memory
       operand is implicitly locked, so the lock prefix is redundant but
       harmless */
    __asm__ __volatile__(
        "lock; xchgl %0, %1"
        : "=r"(ret), "=m"(*spinlock)
        : "0"(1), "m"(*spinlock)
        : "memory");
#endif /* IA-64/IA-32 */
#else
    /* S/390: compare-and-swap 0 -> 1; the previous value lands in ret */
    __asm__ __volatile__(
        "    la   1,1\n"
        "    la   2,%2\n"
        "    slr  0,0\n"
        "    cs   0,1,0(2)\n"
        "    lr   %1,0\n"
        : "=m" (*spinlock), "=r" (ret)
        : "m" (*spinlock)
        : "0", "1", "2");
#endif
    return ret;
}

intern void pth_release_lock(pth_qlock_t *spinlock)
{
    spinlock->count--;
    if (spinlock->count == 0) {
        spinlock->owner = 0;
        spinlock->lock = 0;
    }
}

intern void pth_acquire_lock(pth_qlock_t *spinlock)
{
    int cnt = 0;
    struct timespec tm;
    int tid = gettid();

    /* Already have the lock? */
    if (tid == spinlock->owner) {
        spinlock->count++;
        return;
    }

    /* Spin for up to MAX_SPIN_COUNT attempts, then sleep briefly and retry... */
    while (pth_testandset(&(spinlock->lock))) {
        if (cnt < MAX_SPIN_COUNT) {
            cnt++;
        } else {
            tm.tv_sec = 0;
            tm.tv_nsec = SPIN_SLEEP_DURATION;
            nanosleep(&tm, NULL);
            cnt = 0;
        }
    }
    spinlock->count = 1;
    spinlock->owner = tid;
}
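/*
 * Usage sketch for the recursive lock above (illustrative only;
 * pth_native_lock is the instance this file actually uses): nested
 * acquires by the same native tid just bump the count, and the lock is
 * only dropped once the matching number of releases has been made:
 *
 *   pth_acquire_lock(&pth_native_lock);  -- count = 1, owner = tid
 *   pth_acquire_lock(&pth_native_lock);  -- same owner, count = 2
 *   pth_release_lock(&pth_native_lock);  -- count = 1, still held
 *   pth_release_lock(&pth_native_lock);  -- count = 0, owner cleared,
 *                                           lock word released
 */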