Index: glibc-2.22/elf/dl-close.c
===================================================================
--- glibc-2.22.orig/elf/dl-close.c
+++ glibc-2.22/elf/dl-close.c
@@ -79,15 +79,20 @@ remove_slotinfo (size_t idx, struct dtv_
 	{
 	  assert (old_map->l_tls_modid == idx);
 
-	  /* Mark the entry as unused. */
-	  listp->slotinfo[idx - disp].gen = GL(dl_tls_generation) + 1;
-	  listp->slotinfo[idx - disp].map = NULL;
+	  /* Mark the entry as unused.  These can be read concurrently.  */
+	  atomic_store_relaxed (&listp->slotinfo[idx - disp].gen,
+				GL(dl_tls_generation) + 1);
+	  atomic_store_relaxed (&listp->slotinfo[idx - disp].map, NULL);
 	}
 
       /* If this is not the last currently used entry no need to look
 	 further.  */
       if (idx != GL(dl_tls_max_dtv_idx))
-	return true;
+	{
+	  /* There is an unused dtv entry in the middle.  */
+	  GL(dl_tls_dtv_gaps) = true;
+	  return true;
+	}
     }
 
   while (idx - disp > (disp == 0 ? 1 + GL(dl_tls_static_nelem) : 0))
@@ -96,8 +101,8 @@ remove_slotinfo (size_t idx, struct dtv_
 
       if (listp->slotinfo[idx - disp].map != NULL)
 	{
-	  /* Found a new last used index.  */
-	  GL(dl_tls_max_dtv_idx) = idx;
+	  /* Found a new last used index.  This can be read concurrently.  */
+	  atomic_store_relaxed (&GL(dl_tls_max_dtv_idx), idx);
 	  return true;
 	}
     }
@@ -533,6 +538,9 @@ _dl_close_worker (struct link_map *map,
   size_t tls_free_end;
   tls_free_start = tls_free_end = NO_TLS_OFFSET;
 
+  /* Protects global and module specitic TLS state.  */
+  __rtld_lock_lock_recursive (GL(dl_load_tls_lock));
+
   /* We modify the list of loaded objects.  */
   __rtld_lock_lock_recursive (GL(dl_load_write_lock));
 
@@ -558,7 +566,9 @@ _dl_close_worker (struct link_map *map,
 					GL(dl_tls_dtv_slotinfo_list), 0,
 					imap->l_init_called))
 		/* All dynamically loaded modules with TLS are unloaded.  */
-		GL(dl_tls_max_dtv_idx) = GL(dl_tls_static_nelem);
+		/* Can be read concurrently.  */
+		atomic_store_relaxed (&GL(dl_tls_max_dtv_idx),
+				      GL(dl_tls_static_nelem));
 
 	      if (imap->l_tls_offset != NO_TLS_OFFSET
 		  && imap->l_tls_offset != FORCED_DYNAMIC_TLS_OFFSET)
@@ -752,13 +762,19 @@ _dl_close_worker (struct link_map *map,
   /* If we removed any object which uses TLS bump the generation counter.  */
   if (any_tls)
     {
-      if (__glibc_unlikely (++GL(dl_tls_generation) == 0))
+      size_t newgen = GL(dl_tls_generation) + 1;
+      if (__glibc_unlikely (newgen == 0))
 	_dl_fatal_printf ("TLS generation counter wrapped!  Please report as described in "REPORT_BUGS_TO".\n");
+      /* Can be read concurrently.  */
+      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
 
       if (tls_free_end == GL(dl_tls_static_used))
 	GL(dl_tls_static_used) = tls_free_start;
     }
 
+  /* TLS is cleaned up for the unloaded modules.  */
+  __rtld_lock_unlock_recursive (GL(dl_load_tls_lock));
+
 #ifdef SHARED
   /* Auditing checkpoint: we have deleted all objects.  */
   if (__glibc_unlikely (do_audit))
Index: glibc-2.22/elf/dl-open.c
===================================================================
--- glibc-2.22.orig/elf/dl-open.c
+++ glibc-2.22/elf/dl-open.c
@@ -53,6 +53,9 @@ struct dl_open_args
   struct link_map *map;
   /* Namespace ID.  */
   Lmid_t nsid;
+  /* Set to true if the end of dl_open_worker_begin was reached.  */
+  bool worker_continue;
+
   /* Original parameters to the program and the current environment.  */
   int argc;
   char **argv;
@@ -182,7 +185,7 @@ _dl_find_dso_for_object (const ElfW(Addr
 rtld_hidden_def (_dl_find_dso_for_object);
 
 static void
-dl_open_worker (void *a)
+dl_open_worker_begin (void *a)
 {
   struct dl_open_args *args = a;
   const char *file = args->file;
@@ -524,9 +527,15 @@ dl_open_worker (void *a)
     }
 
   /* Bump the generation number if necessary.  */
-  if (any_tls && __builtin_expect (++GL(dl_tls_generation) == 0, 0))
-    _dl_fatal_printf (N_("\
+  if (any_tls)
+    {
+      size_t newgen = GL(dl_tls_generation) + 1;
+      if (__glibc_unlikely (newgen == 0))
+	_dl_fatal_printf (N_("\
 TLS generation counter wrapped!  Please report this."));
+      /* Can be read concurrently.  */
+      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+    }
 
   /* We need a second pass for static tls data, because _dl_update_slotinfo
      must not be run while calls to _dl_add_to_slotinfo are still pending.  */
@@ -560,6 +569,61 @@ TLS generation counter wrapped!  Please
   DL_STATIC_INIT (new);
 #endif
 
+  args->worker_continue = true;
+}
+
+static void
+dl_open_worker (void *a)
+{
+  struct dl_open_args *args = a;
+
+  args->worker_continue = false;
+
+  {
+    /* Protects global and module specific TLS state.  */
+    __rtld_lock_lock_recursive (GL(dl_load_tls_lock));
+
+    const char *objname;
+    const char *errstring;
+    bool malloced;
+    int errcode = _dl_catch_error (&objname, &errstring, &malloced,
+				   dl_open_worker_begin, args);
+
+    __rtld_lock_unlock_recursive (GL(dl_load_tls_lock));
+
+    if (__glibc_unlikely (errstring != NULL))
+      {
+	/* Make a local copy of the error string so that we can release the
+	   memory allocated for it.  */
+	size_t len_errstring = strlen (errstring) + 1;
+	char *local_errstring;
+	if (objname == errstring + len_errstring)
+	  {
+	    size_t total_len = len_errstring + strlen (objname) + 1;
+	    local_errstring = alloca (total_len);
+	    memcpy (local_errstring, errstring, total_len);
+	    objname = local_errstring + len_errstring;
+	  }
+	else
+	  {
+	    local_errstring = alloca (len_errstring);
+	    memcpy (local_errstring, errstring, len_errstring);
+	  }
+
+	if (malloced)
+	  free ((char *) errstring);
+
+	/* Reraise the error.  */
+	_dl_signal_error (errcode, objname, NULL, local_errstring);
+      }
+  }
+
+  if (!args->worker_continue)
+    return;
+
+  int mode = args->mode;
+  struct link_map *new = args->map;
+
   /* Run the initializer functions of new objects.  */
   _dl_init (new, args->argc, args->argv, args->env);
 
@@ -661,16 +725,6 @@ no more namespaces available for dlmopen
 	 state if relocation failed, for example.  */
       if (args.map)
 	{
-	  /* Maybe some of the modules which were loaded use TLS.
-	     Since it will be removed in the following _dl_close call
-	     we have to mark the dtv array as having gaps to fill the
-	     holes.  This is a pessimistic assumption which won't hurt
-	     if not true.  There is no need to do this when we are
-	     loading the auditing DSOs since TLS has not yet been set
-	     up.  */
-	  if ((mode & __RTLD_AUDIT) == 0)
-	    GL(dl_tls_dtv_gaps) = true;
-
 	  _dl_close_worker (args.map, true);
 	}
 
Index: glibc-2.22/elf/dl-support.c
===================================================================
--- glibc-2.22.orig/elf/dl-support.c
+++ glibc-2.22/elf/dl-support.c
@@ -215,6 +215,13 @@ __rtld_lock_define_initialized_recursive
    list of loaded objects while an object is added to or removed from
    that list.  */
 __rtld_lock_define_initialized_recursive (, _dl_load_write_lock)
+  /* This lock protects global and module specific TLS related data.
+     E.g. it is held in dlopen and dlclose when GL(dl_tls_generation),
+     GL(dl_tls_max_dtv_idx) or GL(dl_tls_dtv_slotinfo_list) are
+     accessed and when TLS related relocations are processed for a
+     module.  It was introduced to keep pthread_create accessing TLS
+     state that is being set up.  */
+__rtld_lock_define_initialized_recursive (, _dl_load_tls_lock)
 
 
 #ifdef HAVE_AUX_VECTOR
Index: glibc-2.22/elf/dl-tls.c
===================================================================
--- glibc-2.22.orig/elf/dl-tls.c
+++ glibc-2.22/elf/dl-tls.c
@@ -97,7 +97,9 @@ _dl_next_tls_modid (void)
       /* No gaps, allocate a new entry.  */
     nogaps:
 
-      result = ++GL(dl_tls_max_dtv_idx);
+      result = GL(dl_tls_max_dtv_idx) + 1;
+      /* Can be read concurrently.  */
+      atomic_store_relaxed (&GL(dl_tls_max_dtv_idx), result);
     }
 
   return result;
@@ -108,10 +110,7 @@ size_t
 internal_function
 _dl_count_modids (void)
 {
-  /* It is rare that we have gaps; see elf/dl-open.c (_dl_open) where
-     we fail to load a module and unload it leaving a gap.  If we don't
-     have gaps then the number of modids is the current maximum so
-     return that.  */
+  /* The count is the max unless dlclose or failed dlopen created gaps.  */
   if (__glibc_likely (!GL(dl_tls_dtv_gaps)))
     return GL(dl_tls_max_dtv_idx);
 
@@ -315,10 +314,12 @@ allocate_dtv (void *result)
   dtv_t *dtv;
   size_t dtv_length;
 
+  /* Relaxed MO, because the dtv size is later rechecked, not relied on.  */
+  size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
   /* We allocate a few more elements in the dtv than are needed for the
      initial set of modules.  This should avoid in most cases expansions
      of the dtv.  */
-  dtv_length = GL(dl_tls_max_dtv_idx) + DTV_SURPLUS;
+  dtv_length = max_modid + DTV_SURPLUS;
   dtv = calloc (dtv_length + 2, sizeof (dtv_t));
   if (dtv != NULL)
     {
@@ -402,14 +403,11 @@ extern dtv_t _dl_static_dtv[];
 #endif
 
 static dtv_t *
-_dl_resize_dtv (dtv_t *dtv)
+_dl_resize_dtv (dtv_t *dtv, size_t max_modid)
 {
   /* Resize the dtv.  */
   dtv_t *newp;
-  /* Load GL(dl_tls_max_dtv_idx) atomically since it may be written to by
-     other threads concurrently.  */
-  size_t newsize
-    = atomic_load_acquire (&GL(dl_tls_max_dtv_idx)) + DTV_SURPLUS;
+  size_t newsize = max_modid + DTV_SURPLUS;
   size_t oldsize = dtv[-1].counter;
 
   if (dtv == GL(dl_initial_dtv))
@@ -456,11 +454,14 @@ _dl_allocate_tls_init (void *result)
   size_t total = 0;
   size_t maxgen = 0;
 
+  /* Protects global dynamic TLS related state.  */
+  __rtld_lock_lock_recursive (GL(dl_load_tls_lock));
+
   /* Check if the current dtv is big enough.   */
   if (dtv[-1].counter < GL(dl_tls_max_dtv_idx))
     {
       /* Resize the dtv.  */
-      dtv = _dl_resize_dtv (dtv);
+      dtv = _dl_resize_dtv (dtv, GL(dl_tls_max_dtv_idx));
 
       /* Install this new dtv in the thread data structures.  */
       INSTALL_DTV (result, &dtv[-1]);
@@ -524,6 +525,7 @@ _dl_allocate_tls_init (void *result)
       listp = listp->next;
       assert (listp != NULL);
     }
+  __rtld_lock_unlock_recursive (GL(dl_load_tls_lock));
 
   /* The DTV version is up-to-date now.  */
   dtv[0].counter = maxgen;
@@ -644,12 +646,29 @@ _dl_update_slotinfo (unsigned long int r
 
   if (dtv[0].counter < listp->slotinfo[idx].gen)
     {
-      /* The generation counter for the slot is higher than what the
-	 current dtv implements.  We have to update the whole dtv but
-	 only those entries with a generation counter <= the one for
-	 the entry we need.  */
+      /* CONCURRENCY NOTES:
+
+	 Here the dtv needs to be updated to new_gen generation count.
+
+	 This code may be called during TLS access when GL(dl_load_tls_lock)
+	 is not held.  In that case the user code has to synchronize with
+	 dlopen and dlclose calls of relevant modules.  A module m is
+	 relevant if the generation of m <= new_gen and dlclose of m is
+	 synchronized: a memory access here happens after the dlopen and
+	 before the dlclose of relevant modules.  The dtv entries for
+	 relevant modules need to be updated, other entries can be
+	 arbitrary.
+
+	 This e.g. means that the first part of the slotinfo list can be
+	 accessed race free, but the tail may be concurrently extended.
+	 Similarly relevant slotinfo entries can be read race free, but
+	 other entries are racy.  However updating a non-relevant dtv
+	 entry does not affect correctness.  For a relevant module m,
+	 max_modid >= modid of m.  */
       size_t new_gen = listp->slotinfo[idx].gen;
       size_t total = 0;
+      size_t max_modid  = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
+      assert (max_modid >= req_modid);
 
       /* We have to look through the entire dtv slotinfo list.  */
       listp =  GL(dl_tls_dtv_slotinfo_list);
@@ -657,12 +676,16 @@ _dl_update_slotinfo (unsigned long int r
 	{
 	  for (size_t cnt = total == 0 ? 1 : 0; cnt < listp->len; ++cnt)
 	    {
-	      size_t gen = listp->slotinfo[cnt].gen;
+	      size_t modid = total + cnt;
+
+	      /* Later entries are not relevant.  */
+	      if (modid > max_modid)
+		break;
+
+	      size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
 
 	      if (gen > new_gen)
-		/* This is a slot for a generation younger than the
-		   one we are handling now.  It might be incompletely
-		   set up so ignore it.  */
+		/* Not relevant.  */
 		continue;
 
 	      /* If the entry is older than the current dtv layout we
@@ -671,31 +694,16 @@ _dl_update_slotinfo (unsigned long int r
 		continue;
 
 	      /* If there is no map this means the entry is empty.  */
-	      struct link_map *map = listp->slotinfo[cnt].map;
-	      if (map == NULL)
-		{
-		  if (dtv[-1].counter >= total + cnt)
-		    {
-		      /* If this modid was used at some point the memory
-			 might still be allocated.  */
-		      if (! dtv[total + cnt].pointer.is_static
-			  && (dtv[total + cnt].pointer.val
-			      != TLS_DTV_UNALLOCATED))
-			free (dtv[total + cnt].pointer.val);
-		      dtv[total + cnt].pointer.val = TLS_DTV_UNALLOCATED;
-		      dtv[total + cnt].pointer.is_static = false;
-		    }
-
-		  continue;
-		}
-
+	      struct link_map *map
+		= atomic_load_relaxed (&listp->slotinfo[cnt].map);
 	      /* Check whether the current dtv array is large enough.  */
-	      size_t modid = map->l_tls_modid;
-	      assert (total + cnt == modid);
 	      if (dtv[-1].counter < modid)
 		{
+		  if (map == NULL)
+		    continue;
+
 		  /* Resize the dtv.  */
-		  dtv = _dl_resize_dtv (dtv);
+		  dtv = _dl_resize_dtv (dtv, max_modid);
 
 		  assert (modid <= dtv[-1].counter);
 
@@ -724,8 +732,17 @@ _dl_update_slotinfo (unsigned long int r
 	    }
 
 	  total += listp->len;
+	  if (total > max_modid)
+	    break;
+
+	  /* Synchronize with _dl_add_to_slotinfo.  Ideally this would
+	     be consume MO since we only need to order the accesses to
+	     the next node after the read of the address and on most
+	     hardware (other than alpha) a normal load would do that
+	     because of the address dependency.  */
+	  listp = atomic_load_acquire (&listp->next);
 	}
-      while ((listp = listp->next) != NULL);
+      while (listp != NULL);
 
       /* This will be the new maximum generation counter.  */
       dtv[0].counter = new_gen;
@@ -762,11 +779,11 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t
   if (__glibc_unlikely (the_map->l_tls_offset
 			!= FORCED_DYNAMIC_TLS_OFFSET))
     {
-      __rtld_lock_lock_recursive (GL(dl_load_lock));
+      __rtld_lock_lock_recursive (GL(dl_load_tls_lock));
       if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET))
 	{
 	  the_map->l_tls_offset = FORCED_DYNAMIC_TLS_OFFSET;
-	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
+	  __rtld_lock_unlock_recursive (GL(dl_load_tls_lock));
 	}
       else if (__glibc_likely (the_map->l_tls_offset
 			       != FORCED_DYNAMIC_TLS_OFFSET))
@@ -778,7 +795,7 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t
 #else
 # error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
 #endif
-	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
+	  __rtld_lock_unlock_recursive (GL(dl_load_tls_lock));
 
 	  dtv[GET_ADDR_MODULE].pointer.is_static = true;
 	  dtv[GET_ADDR_MODULE].pointer.val = p;
@@ -786,7 +803,7 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t
 	  return (char *) p + GET_ADDR_OFFSET;
 	}
       else
-	__rtld_lock_unlock_recursive (GL(dl_load_lock));
+	__rtld_lock_unlock_recursive (GL(dl_load_tls_lock));
     }
   void *p = dtv[GET_ADDR_MODULE].pointer.val = allocate_and_init (the_map);
   assert (!dtv[GET_ADDR_MODULE].pointer.is_static);
@@ -828,7 +845,12 @@ __tls_get_addr (GET_ADDR_ARGS)
 {
   dtv_t *dtv = THREAD_DTV ();
 
-  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+  /* Update is needed if dtv[0].counter < the generation of the accessed
+     module.  The global generation counter is used here as it is easier
+     to check.  Synchronization for the relaxed MO access is guaranteed
+     by user code, see CONCURRENCY NOTES in _dl_update_slotinfo.  */
+  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+  if (__glibc_unlikely (dtv[0].counter != gen))
     return update_get_addr (GET_ADDR_PARAM);
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -851,7 +873,10 @@ _dl_tls_get_addr_soft (struct link_map *
     return NULL;
 
   dtv_t *dtv = THREAD_DTV ();
-  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+  /* This may be called without holding the GL(dl_load_tls_lock).  Reading
+     arbitrary gen value is fine since this is best effort code.  */
+  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+  if (__glibc_unlikely (dtv[0].counter != gen))
     {
       /* This thread's DTV is not completely current,
 	 but it might already cover this module.  */
@@ -916,7 +941,7 @@ _dl_add_to_slotinfo (struct link_map *l)
 	 the first slot.  */
       assert (idx == 0);
 
-      listp = prevp->next = (struct dtv_slotinfo_list *)
+      listp = (struct dtv_slotinfo_list *)
 	malloc (sizeof (struct dtv_slotinfo_list)
 		+ TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
       if (listp == NULL)
@@ -939,9 +964,11 @@ cannot create TLS data structures"));
       listp->next = NULL;
       memset (listp->slotinfo, '\0',
 	      TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
+      /* Synchronize with _dl_update_slotinfo.  */
+      atomic_store_release (&prevp->next, listp);
     }
 
   /* Add the information into the slotinfo data structure.  */
-  listp->slotinfo[idx].map = l;
-  listp->slotinfo[idx].gen = GL(dl_tls_generation) + 1;
+  atomic_store_relaxed (&listp->slotinfo[idx].map, l);
+  atomic_store_relaxed (&listp->slotinfo[idx].gen, GL(dl_tls_generation) + 1);
 }
Index: glibc-2.22/elf/rtld.c
===================================================================
--- glibc-2.22.orig/elf/rtld.c
+++ glibc-2.22/elf/rtld.c
@@ -129,6 +129,7 @@ struct rtld_global _rtld_global =
 #ifdef _LIBC_REENTRANT
     ._dl_load_lock = _RTLD_LOCK_RECURSIVE_INITIALIZER,
     ._dl_load_write_lock = _RTLD_LOCK_RECURSIVE_INITIALIZER,
+    ._dl_load_tls_lock = _RTLD_LOCK_RECURSIVE_INITIALIZER,
 #endif
     ._dl_nns = 1,
     ._dl_ns =
Index: glibc-2.22/sysdeps/generic/ldsodefs.h
===================================================================
--- glibc-2.22.orig/sysdeps/generic/ldsodefs.h
+++ glibc-2.22/sysdeps/generic/ldsodefs.h
@@ -322,6 +322,13 @@ struct rtld_global
      list of loaded objects while an object is added to or removed
      from that list.  */
   __rtld_lock_define_recursive (EXTERN, _dl_load_write_lock)
+  /* This lock protects global and module specific TLS related data.
+     E.g. it is held in dlopen and dlclose when GL(dl_tls_generation),
+     GL(dl_tls_max_dtv_idx) or GL(dl_tls_dtv_slotinfo_list) are
+     accessed and when TLS related relocations are processed for a
+     module.  It was introduced to keep pthread_create accessing TLS
+     state that is being set up.  */
+  __rtld_lock_define_recursive (EXTERN, _dl_load_tls_lock)
 
   /* Incremented whenever something may have been added to dl_loaded.  */
   EXTERN unsigned long long _dl_load_adds;
Index: glibc-2.22/sysdeps/nptl/fork.c
===================================================================
--- glibc-2.22.orig/sysdeps/nptl/fork.c
+++ glibc-2.22/sysdeps/nptl/fork.c
@@ -185,6 +185,9 @@ __libc_fork (void)
       /* Reset the lock the dynamic loader uses to protect its data.  */
       __rtld_lock_initialize (GL(dl_load_lock));
 
+      /* Reset the lock protecting dynamic TLS related data.  */
+      __rtld_lock_initialize (GL(dl_load_tls_lock));
+
       /* Run the handlers registered for the child.  */
       while (allp != NULL)
 	{
