diff --git a/pyda_core/pyda_core.c b/pyda_core/pyda_core.c
index 4c4fc9e..59e5ac4 100644
--- a/pyda_core/pyda_core.c
+++ b/pyda_core/pyda_core.c
@@ -34,10 +34,9 @@ pyda_process* pyda_mk_process() {
     pyda_process *proc = dr_global_alloc(sizeof(pyda_process));
     proc->refcount = 0; // xxx: will be incremented to 1 by first pyda_mk_thread
-    proc->dirty_hooks = 0;
+    proc->flush_count = 0;
 
     drvector_init(&proc->threads, 0, true, NULL);
     drvector_init(&proc->thread_run_untils, 0, true, NULL);
-    drvector_init(&proc->hook_delete_queue, 0, true, NULL);
 
     proc->main_thread = pyda_mk_thread(proc);
     hashtable_init_ex(&proc->callbacks, 4, HASH_INTPTR, false, false, free_hook, NULL, NULL);
@@ -214,7 +213,10 @@ pyda_thread* pyda_mk_thread(pyda_process *proc) {
     thread->run_until = 0;
     thread->signal = 0;
     thread->dirty_run_until = 0;
+    thread->flush_ts = proc->flush_count;
+
     drvector_init(&thread->context_stack, 0, true, free_context);
+    drvector_init(&thread->hook_update_queue, 0, true, NULL);
 
     drvector_append(&proc->threads, thread);
     drvector_append(&proc->thread_run_untils, NULL);
@@ -285,7 +287,7 @@ void pyda_thread_destroy_last(pyda_thread *t) {
 
 void pyda_yield_noblock(pyda_thread *t) {
     t->python_yielded = 1;
     pthread_mutex_lock(&t->mutex);
-    pthread_cond_signal(&t->resume_cond);
+    pthread_cond_broadcast(&t->resume_cond);
     pthread_mutex_unlock(&t->mutex);
 }
@@ -295,10 +297,9 @@ void pyda_yield(pyda_thread *t) {
     t->yield_count++;
 
     // here we wait for the executable to signal
-    // dr_set_safe_for_sync(false);
     pthread_mutex_lock(&t->mutex);
-    pthread_cond_signal(&t->resume_cond);
+    pthread_cond_broadcast(&t->resume_cond);
 
     while (!t->app_yielded)
         pthread_cond_wait(&t->break_cond, &t->mutex);
@@ -306,13 +307,12 @@
     t->app_yielded = 0;
     pthread_mutex_unlock(&t->mutex);
-    // dr_set_safe_for_sync(true);
 }
 
 void pyda_break_noblock(pyda_thread *t) {
     t->app_yielded = 1;
     pthread_mutex_lock(&t->mutex);
-    pthread_cond_signal(&t->break_cond);
+    pthread_cond_broadcast(&t->break_cond);
     pthread_mutex_unlock(&t->mutex);
 }
@@ -320,29 +320,31 @@ void pyda_break(pyda_thread *t) {
     t->app_yielded = 1;
 
-    // here we wait for the python to signal
-    // dr_set_safe_for_sync(false);
+    // Hack to tell dynamorio that dr_flush_region on another thread is OK
+    // here -- this is not REALLY safe per the docs but we use
+    // dr_redirect_execution so we *should* always return to a valid fragment...
+    dr_mark_safe_to_suspend(dr_get_current_drcontext(), true);
+    // here we wait for the python to signal
     pthread_mutex_lock(&t->mutex);
-    pthread_cond_signal(&t->break_cond);
+    pthread_cond_broadcast(&t->break_cond);
 
     while (!t->python_yielded)
         pthread_cond_wait(&t->resume_cond, &t->mutex);
 
+    dr_mark_safe_to_suspend(dr_get_current_drcontext(), false);
+
     t->python_yielded = 0;
     pthread_mutex_unlock(&t->mutex);
-    // dr_set_safe_for_sync(true);
 }
 
 void pyda_initial_break(pyda_thread *t) {
     // lock is already held
-    // dr_set_safe_for_sync(false);
     while (!t->python_yielded)
         pthread_cond_wait(&t->resume_cond, &t->mutex);
 
     t->python_yielded = 0;
     pthread_mutex_unlock(&t->mutex);
-    // dr_set_safe_for_sync(true);
 }
 
 PyObject *pyda_run_until(pyda_thread *proc, uint64_t addr) {
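Note on the hunks above: pyda_yield/pyda_break implement a two-flag rendezvous between the app thread and its Python thread, and the switch from pthread_cond_signal to pthread_cond_broadcast matters because more than one thread can now wait on the same condvar (python_aux_thread also waits on the main thread's resume_cond; see tool.c below), so a single signal could wake the wrong waiter and strand the other. A reduced, self-contained sketch of the handshake pattern -- the struct and function names here are illustrative, not the real pyda_thread layout:

    #include <pthread.h>
    #include <stdbool.h>

    typedef struct {
        pthread_mutex_t mu;
        pthread_cond_t resume_cond;  /* "the Python side has yielded" */
        pthread_cond_t break_cond;   /* "the app side has yielded"    */
        bool python_yielded, app_yielded;
    } handshake_t;

    /* App side: announce that we stopped, wake *all* waiters, then sleep
     * until the Python side yields back. The Python side is symmetric. */
    static void app_break(handshake_t *h) {
        pthread_mutex_lock(&h->mu);
        h->app_yielded = true;
        pthread_cond_broadcast(&h->break_cond);  /* every waiter re-checks its predicate */
        while (!h->python_yielded)
            pthread_cond_wait(&h->resume_cond, &h->mu);
        h->python_yielded = false;               /* consume the yield */
        pthread_mutex_unlock(&h->mu);
    }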
@@ -350,12 +352,12 @@
     return NULL;
 }
 
-void pyda_add_hook(pyda_process *t, uint64_t addr, PyObject *callback) {
+void pyda_add_hook(pyda_process *p, uint64_t addr, PyObject *callback) {
     pyda_hook *cb = dr_global_alloc(sizeof(pyda_hook));
     cb->py_func = callback;
     Py_INCREF(callback);
 
-    DEBUG_PRINTF("pyda_add_hook %p %p\n", cb, cb->py_func);
+    DEBUG_PRINTF("pyda_add_hook %p %p for %llx\n", cb, cb->py_func, addr);
 
     cb->callback_type = 0;
     cb->addr = (void*)addr;
@@ -365,19 +367,22 @@
     // dr_where_am_i_t whereami = dr_where_am_i(drcontext, (void*)addr, NULL);
     // DEBUG_PRINTF("Hook is in %lu\n", whereami);
 
-    if (!hashtable_add(&t->callbacks, (void*)addr, cb)) {
+    if (!hashtable_add(&p->callbacks, (void*)addr, cb)) {
         dr_global_free(cb, sizeof(pyda_hook));
         dr_fprintf(STDERR, "Failed to add hook at %p\n", (void*)addr);
         dr_abort();
     }
-    t->dirty_hooks = 1;
+
+    pyda_thread *t = pyda_thread_getspecific(g_pyda_tls_idx);
+    drvector_append(&t->hook_update_queue, (void*)addr);
 }
 
 void pyda_remove_hook(pyda_process *p, uint64_t addr) {
     hashtable_remove(&p->callbacks, (void*)addr);
-    p->dirty_hooks = 1;
-    drvector_append(&p->hook_delete_queue, (void*)addr);
+
+    pyda_thread *t = pyda_thread_getspecific(g_pyda_tls_idx);
+    drvector_append(&t->hook_update_queue, (void*)addr);
 }
 
 void pyda_set_thread_init_hook(pyda_process *p, PyObject *callback) {
@@ -410,17 +415,8 @@ void pyda_set_syscall_post_hook(pyda_process *p, PyObject *callback) {
     Py_INCREF(callback);
 }
 
-static void flush_hook(void *hook) {
-    pyda_hook *cb = (pyda_hook*)hook;
-    if (cb->callback_type == 0) {
-        DEBUG_PRINTF("dr_flush_region: %llx\n", (void*)cb->addr);
-        dr_flush_region((void*)cb->addr, 1);
-        DEBUG_PRINTF("dr_flush_region end\n");
-    }
-
-}
-
 int pyda_flush_hooks() {
+    void *drcontext = dr_get_current_drcontext();
     pyda_thread *t = pyda_thread_getspecific(g_pyda_tls_idx);
     pyda_process *p = t->proc;
@@ -432,20 +428,42 @@
         flushed = 1;
     }
 
-    if (p->dirty_hooks) {
-        hashtable_apply_to_all_payloads(&p->callbacks, flush_hook);
-        p->dirty_hooks = 0;
+    if (t->hook_update_queue.entries) {
         flushed = 1;
 
+        // Copy to temporary, alternate storage so we don't hold the lock
+        int entry_count = t->hook_update_queue.entries;
+        void **tmp = dr_thread_alloc(drcontext, sizeof(void*) * entry_count);
+        if (!tmp) {
+            dr_fprintf(STDERR, "dr_thread_alloc failed");
+            dr_abort();
+        }
+
+        memcpy(tmp, t->hook_update_queue.array, sizeof(void*) * entry_count);
+        t->hook_update_queue.entries = 0;
+
         // Flush deleted hooks
-        drvector_lock(&p->hook_delete_queue);
-        for (int i=0; i<p->hook_delete_queue.entries; i++) {
-            dr_flush_region((void*)p->hook_delete_queue.array[i], 1);
+        for (int i=0; i<entry_count; i++) {
+            dr_flush_region((void*)tmp[i], 1);
+            p->flush_count++;
         }
-        p->hook_delete_queue.entries = 0;
-        drvector_unlock(&p->hook_delete_queue);
+
+        dr_thread_free(drcontext, tmp, sizeof(void*) * entry_count);
+    }
+
+    if (t->flush_ts != p->flush_count) {
+        // Require that dr_redirect_execution is used, since another thread may have flushed
+        // us during the dr_mark_safe_to_suspend section in thread_prepare_for_python_entry
+        //
+        // note: right now other threads cannot flush us
+        flushed = 1;
+        t->flush_ts = p->flush_count;
     }
+
     return flushed;
 }
 
 pyda_hook* pyda_get_callback(pyda_process *p, void* addr) {
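The rewritten pyda_flush_hooks() above snapshots the thread-local queue before calling dr_flush_region, since a flush may suspend other threads and must not walk a vector that those threads might still be appending to. A standalone sketch of that drain-then-flush pattern, with plain malloc standing in for dr_thread_alloc and an assumed flush_code_region() in place of the DR call:

    #include <stdlib.h>
    #include <string.h>

    extern void flush_code_region(void *pc);  /* stand-in for dr_flush_region(pc, 1) */

    /* queue/entries model the drvector fields; flush_count models p->flush_count. */
    static int drain_and_flush(void **queue, int *entries, int *flush_count) {
        int n = *entries;
        if (n == 0)
            return 0;

        void **tmp = malloc(sizeof(void *) * n);  /* snapshot first */
        if (!tmp)
            abort();
        memcpy(tmp, queue, sizeof(void *) * n);
        *entries = 0;

        for (int i = 0; i < n; i++) {
            flush_code_region(tmp[i]);
            (*flush_count)++;  /* advance the process-wide flush epoch */
        }

        free(tmp);
        return 1;              /* a flush happened; caller must redirect */
    }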
@@ -481,14 +499,22 @@ void pyda_clear_run_until(pyda_thread *t) {
 int pyda_check_run_until(pyda_process *proc, void *test_pc) {
     // Unlocked for performance.
-    for (int i=0; i<proc->thread_run_untils.capacity; i++) {
+    for (int i=0; i<proc->thread_run_untils.entries; i++) {
         if (test_pc == proc->thread_run_untils.array[i]) return 1;
     }
     return 0;
 }
 
 static void thread_prepare_for_python_entry(PyGILState_STATE *gstate, pyda_thread *t, void* pc) {
-    if (gstate) *gstate = PyGILState_Ensure();
+    if (gstate) {
+        // HACK: This is not allowed per the docs. We get away with this
+        // because we check later to see if any flushes occurred during this period
+        t->flush_ts = t->proc->flush_count;
+
+        dr_mark_safe_to_suspend(dr_get_current_drcontext(), true);
+        *gstate = PyGILState_Ensure();
+        dr_mark_safe_to_suspend(dr_get_current_drcontext(), false);
+    }
 
     void *drcontext = dr_get_current_drcontext();
     t->cur_context.size = sizeof(dr_mcontext_t);
@@ -512,7 +538,7 @@ static void thread_prepare_for_python_return(PyGILState_STATE *gstate, pyda_thre
         t->errored = 1;
     }
     dr_set_mcontext(drcontext, &t->cur_context);
-    if (t->proc->dirty_hooks) {
+    if (t->hook_update_queue.entries > 0) {
         dr_fprintf(STDERR, "\n[Pyda] WARN: Hooks should not be modified in a syscall. This is UB, continuing.\n");
     }
     if (gstate) PyGILState_Release(*gstate);
@@ -550,24 +576,24 @@ void pyda_hook_cleancall(pyda_hook *cb) {
     if (t->errored) return;
 
+    DEBUG_PRINTF("cleancall %p %p %p tid=%d\n", cb->addr, cb->py_func, t, dr_get_thread_id(dr_get_current_drcontext()));
     thread_prepare_for_python_entry(&gstate, t, cb->addr);
-
-    DEBUG_PRINTF("cleancall %p %p %p\n", cb, cb->py_func, t);
+    DEBUG_PRINTF("cleancall LOCKED %p %p %p\n", cb->addr, cb->py_func, t);
 
     PyObject *result = PyObject_CallFunctionObjArgs(cb->py_func, t->proc->py_obj, NULL);
 
     if (result == NULL) {
         dr_fprintf(STDERR, "\n[Pyda] ERROR: Hook call failed. Skipping future hooks on thread %d\n", t->tid);
-        if (getenv("PYDA_ABORT_ON_ERROR") && getenv("PYDA_ABORT_ON_ERROR")[0] == '1') {
-            dr_fprintf(STDERR, "\n[Pyda] ABORTING (will crash now)\n");
-            *(int*)(1) = 1;
-        }
         dr_flush_file(STDERR);
         t->errored = 1;
         PyErr_Print();
         dr_fprintf(STDERR, "\n");
         // dr_abort();
+
+        if (getenv("PYDA_ABORT_ON_ERROR") && getenv("PYDA_ABORT_ON_ERROR")[0] == '1') {
+            dr_fprintf(STDERR, "\n[Pyda] ABORTING (will crash now)\n");
+            *(int*)(1) = 1;
+        }
     } else {
         Py_DECREF(result);
    }
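Since pyda_flush_hooks() now reports both its own flushes and any concurrent flush detected via the epoch check, every caller inside translated code has to treat a nonzero return as "do not fall through". A hypothetical call site (not the actual Pyda code) showing the intended contract:

    /* After a hook runs: if anything was flushed, the fragment we came from
     * may be stale, so re-enter the code cache via dr_redirect_execution()
     * instead of returning into it. */
    static void finish_hook_and_resume(void) {
        if (pyda_flush_hooks()) {
            dr_mcontext_t mc = { .size = sizeof(mc), .flags = DR_MC_ALL };
            dr_get_mcontext(dr_get_current_drcontext(), &mc);
            dr_redirect_execution(&mc);  /* does not return on success */
        }
        /* otherwise the current fragment is still valid */
    }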
diff --git a/pyda_core/pyda_core.h b/pyda_core/pyda_core.h
index 912d3d4..8a2ebc7 100644
--- a/pyda_core/pyda_core.h
+++ b/pyda_core/pyda_core.h
@@ -29,7 +29,6 @@ struct pyda_hook_s {
 };
 
 struct pyda_process_s {
-    int dirty_hooks;
     int refcount;
 
     pyda_thread *main_thread;
@@ -50,7 +49,7 @@ struct pyda_process_s {
     hashtable_t callbacks;
     drvector_t threads;
     drvector_t thread_run_untils; // vec of pcs
-    drvector_t hook_delete_queue;
+    int flush_count;
 #endif
 };
 
@@ -83,6 +82,13 @@ struct pyda_thread_s {
 #ifdef PYDA_DYNAMORIO_CLIENT
     dr_mcontext_t cur_context;
     drvector_t context_stack;
+
+    // thread-local list of hooks to be flushed; guarantees that changes go into
+    // effect when expected: e.g., when returning from a hook
+    drvector_t hook_update_queue;
+
+    // records the last seen proc->flush_count so that we don't return into a stale fragment
+    int flush_ts;
 #endif
 };
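Design note on the two fields added above: the old process-wide dirty_hooks flag could be observed and cleared by only one thread, whereas a monotonic counter plus a per-thread snapshot lets every thread independently answer "did I miss a flush?". A reduced model of that epoch check (names assumed for illustration):

    typedef struct { int flush_count; } proc_epoch;   /* shared; bumped once per flush    */
    typedef struct { int flush_ts;    } thread_epoch; /* per-thread snapshot of the above */

    static int missed_flush(proc_epoch *p, thread_epoch *t) {
        if (t->flush_ts != p->flush_count) {
            t->flush_ts = p->flush_count;  /* catch up to the current epoch  */
            return 1;                      /* stale fragments: must redirect */
        }
        return 0;
    }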
DEBUG_PRINTF("getenv2 %s=%s\n", name, getenv(name)); return getenv(name); @@ -215,7 +212,7 @@ void parse_proc_environ() { while (key < buf + len) { char *k = strtok(key, "="); char *v = key + strlen(k) + 1; - // DEBUG_PRINTF("setenv %s=%s\n", k, v); + DEBUG_PRINTF("setenv %s=%s\n", k, v); setenv(k, v, 0); key = v + strlen(v) + 1; } diff --git a/pyda_core/tool.c b/pyda_core/tool.c index aa169f4..219f5ee 100644 --- a/pyda_core/tool.c +++ b/pyda_core/tool.c @@ -43,8 +43,7 @@ static void event_attach_post(void); extern int is_dynamorio_running; extern int pyda_attach_mode; -pthread_cond_t python_thread_init1; -pthread_cond_t python_thread_init2; +int is_python_init; int g_pyda_tls_idx; int g_pyda_tls_is_python_thread_idx; @@ -90,8 +89,6 @@ dr_client_main(client_id_t id, int argc, const char *argv[]) dr_register_fork_init_event(fork_event); dr_request_synchronized_exit(); - pthread_cond_init(&python_thread_init1, 0); - g_pyda_tls_idx = drmgr_register_tls_field(); g_pyda_tls_is_python_thread_idx = drmgr_register_tls_field(); } @@ -111,6 +108,9 @@ void thread_init_event(void *drcontext) { pyda_prepare_io(global_proc); t = global_proc->main_thread; + if (!getenv("PYTHONPATH")) { + parse_proc_environ(); + } } else { t = pyda_mk_thread(global_proc); } @@ -139,12 +139,15 @@ void thread_init_event(void *drcontext) { if (t == global_proc->main_thread) { module_data_t *main_mod = dr_get_main_module(); t->proc->entrypoint = (void*)main_mod->entry_point; - } else { - dr_mcontext_t mc; - mc.size = sizeof(mc); - mc.flags = DR_MC_ALL; - dr_get_mcontext(drcontext, &mc); + if (!getenv("PYDA_NO_ATTACH")) { + pyda_attach_mode = 1; + // In attach mode, the entrypoint will never be reached, + // so we release the lock now + DEBUG_PRINTF("PYDA_NO_ATTACH is not set, assuming attach mode\n") + pthread_mutex_unlock(&t->mutex); + } + } else { DEBUG_PRINTF("aux thread initial break\n"); pyda_initial_break(t); DEBUG_PRINTF("aux thread initial break end\n"); @@ -374,33 +377,37 @@ static void fork_event(void *drcontext) { } static void event_attach_post() { - pyda_attach_mode = 1; - parse_proc_environ(); + if (!pyda_attach_mode) { + dr_fprintf(STDERR, "Internal error: PYDA_NO_ATTACH is set but attach callback used\n"); + dr_abort(); + return; + } DEBUG_PRINTF("event_attach_post on tid %d\n", dr_get_thread_id(dr_get_current_drcontext())); pyda_thread *t = pyda_thread_getspecific(g_pyda_tls_idx); - DEBUG_PRINTF("[PYDA] New thread %ld\n", t->tid); + DEBUG_PRINTF("[PYDA] Main thread (attached) is %ld\n", t->tid); if (t->proc->main_thread != t) { dr_fprintf(STDERR, "[Pyda] ERROR: Dynamorio is not running on the main thread. This is probably a bug.\n"); dr_abort(); } - pyda_initial_break(t); - DEBUG_PRINTF("entrypoint flush (attach)"); + pthread_mutex_lock(&t->mutex); // we intentionally released the mutex based on `pyda_attach_mode` + pyda_initial_break(t); // wait for the script to call p.run() // XXX: Not clear if this is legal to call here. If it is, we should note that we don't // have to redirect execution, because we aren't actually in translated code yet! 
diff --git a/pyda_core/tool.c b/pyda_core/tool.c
index aa169f4..219f5ee 100644
--- a/pyda_core/tool.c
+++ b/pyda_core/tool.c
@@ -43,8 +43,7 @@ static void event_attach_post(void);
 extern int is_dynamorio_running;
 extern int pyda_attach_mode;
 
-pthread_cond_t python_thread_init1;
-pthread_cond_t python_thread_init2;
+int is_python_init;
 
 int g_pyda_tls_idx;
 int g_pyda_tls_is_python_thread_idx;
@@ -90,8 +89,6 @@ dr_client_main(client_id_t id, int argc, const char *argv[])
     dr_register_fork_init_event(fork_event);
     dr_request_synchronized_exit();
 
-    pthread_cond_init(&python_thread_init1, 0);
-
     g_pyda_tls_idx = drmgr_register_tls_field();
     g_pyda_tls_is_python_thread_idx = drmgr_register_tls_field();
 }
@@ -111,6 +108,9 @@ void thread_init_event(void *drcontext) {
         pyda_prepare_io(global_proc);
 
         t = global_proc->main_thread;
+        if (!getenv("PYTHONPATH")) {
+            parse_proc_environ();
+        }
     } else {
         t = pyda_mk_thread(global_proc);
     }
@@ -139,12 +139,15 @@ void thread_init_event(void *drcontext) {
     if (t == global_proc->main_thread) {
         module_data_t *main_mod = dr_get_main_module();
         t->proc->entrypoint = (void*)main_mod->entry_point;
-    } else {
-        dr_mcontext_t mc;
-        mc.size = sizeof(mc);
-        mc.flags = DR_MC_ALL;
-        dr_get_mcontext(drcontext, &mc);
 
+        if (!getenv("PYDA_NO_ATTACH")) {
+            pyda_attach_mode = 1;
+            // In attach mode, the entrypoint will never be reached,
+            // so we release the lock now
+            DEBUG_PRINTF("PYDA_NO_ATTACH is not set, assuming attach mode\n");
+            pthread_mutex_unlock(&t->mutex);
+        }
+    } else {
         DEBUG_PRINTF("aux thread initial break\n");
         pyda_initial_break(t);
         DEBUG_PRINTF("aux thread initial break end\n");
@@ -374,33 +377,37 @@ static void fork_event(void *drcontext) {
 }
 
 static void event_attach_post() {
-    pyda_attach_mode = 1;
-    parse_proc_environ();
+    if (!pyda_attach_mode) {
+        dr_fprintf(STDERR, "Internal error: PYDA_NO_ATTACH is set but attach callback used\n");
+        dr_abort();
+        return;
+    }
 
     DEBUG_PRINTF("event_attach_post on tid %d\n", dr_get_thread_id(dr_get_current_drcontext()));
 
     pyda_thread *t = pyda_thread_getspecific(g_pyda_tls_idx);
-    DEBUG_PRINTF("[PYDA] New thread %ld\n", t->tid);
+    DEBUG_PRINTF("[PYDA] Main thread (attached) is %ld\n", t->tid);
 
     if (t->proc->main_thread != t) {
         dr_fprintf(STDERR, "[Pyda] ERROR: Dynamorio is not running on the main thread. This is probably a bug.\n");
         dr_abort();
     }
 
-    pyda_initial_break(t);
-    DEBUG_PRINTF("entrypoint flush (attach)");
+    pthread_mutex_lock(&t->mutex); // we intentionally released the mutex based on `pyda_attach_mode`
+    pyda_initial_break(t); // wait for the script to call p.run()
 
     // XXX: Not clear if this is legal to call here. If it is, we should note that we don't
     // have to redirect execution, because we aren't actually in translated code yet!
-    pyda_flush_hooks();
-    DEBUG_PRINTF("entrypoint end (attach)");
+    /* pyda_flush_hooks(); */
+
+    DEBUG_PRINTF("entrypoint end (attach)\n");
 }
 
 static void thread_entrypoint_break() {
     DEBUG_PRINTF("entrypoint (break)\n");
 
     pyda_thread *t = pyda_thread_getspecific(g_pyda_tls_idx);
-    DEBUG_PRINTF("[PYDA] New thread %ld\n", t->tid);
+    DEBUG_PRINTF("[PYDA] Main thread at entrypoint %ld\n", t->tid);
 
     pyda_initial_break(t);
     if (pyda_flush_hooks()) {
@@ -464,7 +471,6 @@ void python_main_thread(void *arg) {
     t->python_exited = 1;
     t->errored = 1;
 
-    // dr_client_thread_set_suspendable(true);
     DEBUG_PRINTF("After script exit, GIL status %d\n", PyGILState_Check());
     PyEval_SaveThread(); // release GIL
@@ -499,11 +505,18 @@
     DEBUG_PRINTF("python_aux_thread id %d\n", dr_get_thread_id(drcontext));
 
+    // Wait for the main script to reach the first yield (so there is time to set thread_init_hook in the attach case)
+    pthread_mutex_lock(&t->proc->main_thread->mutex);
+    while (!t->proc->main_thread->yield_count)
+        pthread_cond_wait(&t->proc->main_thread->resume_cond, &t->proc->main_thread->mutex);
+    pthread_mutex_unlock(&t->proc->main_thread->mutex);
+
+    DEBUG_PRINTF("python_aux_thread enter id %d\n", dr_get_thread_id(drcontext));
+
+    // Acquire the GIL so this thread can call the thread entrypoint
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
 
-    DEBUG_PRINTF("python_aux_thread id %d locked\n", dr_get_thread_id(drcontext));
-
     // We just call the thread init hook, if one exists
     if (t->proc->thread_init_hook && !t->errored) {
         DEBUG_PRINTF("Calling thread_init_hook\n");
@@ -517,7 +530,6 @@
     PyGILState_Release(gstate);
 
-    dr_client_thread_set_suspendable(true);
     DEBUG_PRINTF("python_aux_thread 4\n");
 
     t->python_exited = 1;
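The ordering introduced in python_aux_thread() is the subtle part of the attach flow: an aux Python thread must not run thread_init_hook until the main script has yielded at least once (i.e., p.run() was reached), or an attaching script would have no chance to install the hook first. A reduced model of that wait, using the pyda_thread fields from pyda_core.h (simplified; error handling omitted):

    static void wait_for_first_yield(pyda_thread *main_thread) {
        pthread_mutex_lock(&main_thread->mutex);
        while (main_thread->yield_count == 0)
            pthread_cond_wait(&main_thread->resume_cond, &main_thread->mutex);
        pthread_mutex_unlock(&main_thread->mutex);
        /* only now is it safe to PyGILState_Ensure() and call the hook */
    }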