From: Erez Zadok <ezk@cs.sunysb.edu>
Date: Fri, 29 Jul 2005 10:47:19 +0000 (+0000)
Subject: * amd/srvr_nfs.c (find_nfs_srvr): don't blindly copy the hostent
X-Git-Tag: am-utils-6_1_1~10
X-Git-Url: https://git.fsl.cs.stonybrook.edu/?a=commitdiff_plain;h=700b36ccbf0e5def1bb617ce574881bcfe876d37;p=am-utils-6.1.git

* amd/srvr_nfs.c (find_nfs_srvr): don't blindly copy the hostent
IP address.  First check if it differs from the existing one of
the fserver, and copy only if it changed.  If it did change, flush
the fhandle cache to avoid a stale fhandle being reused.  This
allows Amd to detect IP address changes even for an fserver that
lost one or more NFS pings, but not enough to be declared totally
down.  We handle the "totally down" fserver case in
check_fs_addr_change().

* amd/ops_nfs.c (nfs_umount), amd/sched.c (sched_task),
amd/nfs_subr.c (nfs_quick_reply): code clarity.

* conf/mount/mount_linux.c (linux_nfs_error): dlog mappings of
errnos to NFS errors.

* conf/umount/umount_linux.c (umount2_fs): cleanup code.  Trying
stat() seems doomed to hang at times, so don't try stat at all,
because umount2() appears to be clever enough to never hang.
---

diff --git a/ChangeLog b/ChangeLog
index 3d5c681..2e893e7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2005-07-29  Erez Zadok  <ezk@cs.sunysb.edu>
+
+	* amd/srvr_nfs.c (find_nfs_srvr): don't blindly copy the hostent
+	IP address.  First check if it differs from the existing one of
+	the fserver, and copy only if it changed.  If it did change, flush
+	the fhandle cache to avoid a stale fhandle being reused.  This
+	allows Amd to detect IP address changes even for an fserver that
+	lost one or more NFS pings, but not enough to be declared totally
+	down.  We handle the "totally down" fserver case in
+	check_fs_addr_change().
+
+	* amd/ops_nfs.c (nfs_umount), amd/sched.c (sched_task),
+	amd/nfs_subr.c (nfs_quick_reply): code clarity.
+
+	* conf/mount/mount_linux.c (linux_nfs_error): dlog mappings of
+	errnos to NFS errors.
+
+	* conf/umount/umount_linux.c (umount2_fs): cleanup code.  Trying
+	stat() seems doomed to hang at times, so don't try stat at all,
+	because umount2() appears to be clever enough to never hang.
+
 2005-07-25  Erez Zadok  <ezk@cs.sunysb.edu>
 
 	* amd/amd.h (FSF_FORCE_UNMOUNT): new flag used to indicate that a
diff --git a/amd/nfs_subr.c b/amd/nfs_subr.c
index 6f5502b..fb60ab0 100644
--- a/amd/nfs_subr.c
+++ b/amd/nfs_subr.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: nfs_subr.c,v 1.33 2005/05/18 18:12:31 ezk Exp $
+ * $Id: nfs_subr.c,v 1.34 2005/07/29 10:47:19 ezk Exp $
  *
  */
 
@@ -283,8 +283,7 @@ nfs_quick_reply(am_node *mp, int error)
     /*
      * Free up transp.  It's only used for one reply.
      */
-    XFREE(transp);
-    mp->am_transp = NULL;
+    XFREE(mp->am_transp);
     dlog("Quick reply sent for %s", mp->am_mnt->mf_mount);
   }
 }
diff --git a/amd/ops_nfs.c b/amd/ops_nfs.c
index e81e3af..c688e21 100644
--- a/amd/ops_nfs.c
+++ b/amd/ops_nfs.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: ops_nfs.c,v 1.45 2005/07/26 03:31:08 ezk Exp $
+ * $Id: ops_nfs.c,v 1.46 2005/07/29 10:47:19 ezk Exp $
  *
  */
 
@@ -925,9 +925,10 @@ nfs_mount(am_node *am, mntfs *mf)
 static int
 nfs_umount(am_node *am, mntfs *mf)
 {
-  int unmount_flags = (mf->mf_flags & MFF_ON_AUTOFS) ? AMU_UMOUNT_AUTOFS : 0;
-  int new_unmount_flags;
-  int error = UMOUNT_FS(mf->mf_mount, mnttab_file_name, unmount_flags);
+  int unmount_flags, new_unmount_flags, error;
+
+  unmount_flags = (mf->mf_flags & MFF_ON_AUTOFS) ? AMU_UMOUNT_AUTOFS : 0;
+  error = UMOUNT_FS(mf->mf_mount, mnttab_file_name, unmount_flags);
 
 #if defined(HAVE_UMOUNT2) && (defined(MNT2_GEN_OPT_FORCE) || defined(MNT2_GEN_OPT_DETACH))
   /*
@@ -938,7 +939,12 @@ nfs_umount(am_node *am, mntfs *mf)
       gopt.flags & CFM_FORCED_UNMOUNTS &&
       mf->mf_server->fs_flags & FSF_FORCE_UNMOUNT) {
     plog(XLOG_INFO, "EZK: nfs_umount: trying forced/lazy unmounts");
-    mf->mf_server->fs_flags &= ~FSF_FORCE_UNMOUNT; /* XXX: incorrect */
+    /*
+     * XXX: turning off the FSF_FORCE_UNMOUNT may not be perfectly
+     * correct.  Multiple nodes may need to be timed out and restarted for
+     * a single hung fserver.
+     */
+    mf->mf_server->fs_flags &= ~FSF_FORCE_UNMOUNT;
     new_unmount_flags = unmount_flags | AMU_UMOUNT_FORCE | AMU_UMOUNT_DETACH;
     error = UMOUNT_FS(mf->mf_mount, mnttab_file_name, new_unmount_flags);
   }
diff --git a/amd/sched.c b/amd/sched.c
index b0ac928..7e42f71 100644
--- a/amd/sched.c
+++ b/amd/sched.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: sched.c,v 1.17 2005/01/03 20:56:45 ezk Exp $
+ * $Id: sched.c,v 1.18 2005/07/29 10:47:19 ezk Exp $
  *
  */
 
@@ -164,7 +164,7 @@ sched_task(cb_fun *cf, opaque_t ca, wchan_t wchan)
   dlog("SLEEP on %p", wchan);
   p->wchan = wchan;
   p->pid = 0;
-  memset((voidp) &p->w, 0, sizeof(p->w));
+  p->w = 0;			/* was memset (when ->w was union) */
 }
 
 
diff --git a/amd/srvr_nfs.c b/amd/srvr_nfs.c
index 9b5f95c..f9d683a 100644
--- a/amd/srvr_nfs.c
+++ b/amd/srvr_nfs.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: srvr_nfs.c,v 1.42 2005/07/26 03:31:08 ezk Exp $
+ * $Id: srvr_nfs.c,v 1.43 2005/07/29 10:47:19 ezk Exp $
  *
  */
 
@@ -444,8 +444,6 @@ check_fs_addr_change(fserver *fs)
   struct hostent *hp = NULL;
   struct in_addr ia;
   char *old_ipaddr, *new_ipaddr;
-  //  nfs_private *np = (nfs_private *) fs->fs_private;
-  EZKDBG;
 
   hp = gethostbyname(fs->fs_host);
   if (!hp ||
@@ -466,7 +464,7 @@ check_fs_addr_change(fserver *fs)
   memmove((voidp) &fs->fs_ip->sin_addr,
 	  (voidp) hp->h_addr,
 	  sizeof(fs->fs_ip->sin_addr));
-  /* XXX: are any of these correct?! */
+  /* XXX: do we need to un/set these flags? */
   fs->fs_flags &= ~FSF_DOWN;
   fs->fs_flags |= FSF_VALID | FSF_WANT;
   map_flush_srvr(fs);		/* XXX: a race with flush_srvr_nfs_cache? */
@@ -475,10 +473,11 @@ check_fs_addr_change(fserver *fs)
 
 #if 0
   flush_nfs_fhandle_cache(fs);	/* done in caller: nfs_keepalive_timeout */
-  // XXX: need to purge nfs_private so that somehow it will get re-initialized
+  /* XXX: need to purge nfs_private so that somehow it will get re-initialized? */
 #endif
 }
 
+
 /*
  * Called when no ping-reply received
  */
@@ -520,7 +519,7 @@ nfs_keepalive_timeout(voidp v)
        */
       flush_nfs_fhandle_cache(fs);
       np->np_error = -1;
-      check_fs_addr_change(fs);	/* check if IP addr of fserver changed */
+      check_fs_addr_change(fs); /* check if IP addr of fserver changed */
     } else {
       /*
        * Known to be down
@@ -929,8 +928,21 @@ no_dns:
        * between mounts.
        * Mike Mitchell, mcm@unx.sas.com, 09/08/93
        */
-      if (hp && fs->fs_ip)
+      if (hp && fs->fs_ip &&
+	  memcmp((voidp) &fs->fs_ip->sin_addr,
+		 (voidp) hp->h_addr,
+		 sizeof(fs->fs_ip->sin_addr)) != 0) {
+	struct in_addr ia;
+	char *old_ipaddr, *new_ipaddr;
+	old_ipaddr = strdup(inet_ntoa(fs->fs_ip->sin_addr));
+	memmove((voidp) &ia, (voidp) hp->h_addr, sizeof(struct in_addr));
+	new_ipaddr = inet_ntoa(ia);	/* ntoa uses static buf */
+	plog(XLOG_WARNING, "fileserver %s changed ip: %s -> %s",
+	     fs->fs_host, old_ipaddr, new_ipaddr);
+	XFREE(old_ipaddr);
+	flush_nfs_fhandle_cache(fs);
 	memmove((voidp) &fs->fs_ip->sin_addr, (voidp) hp->h_addr, sizeof(fs->fs_ip->sin_addr));
+      }
 
       /*
        * If the new file systems doesn't use WebNFS, the nfs pings may
diff --git a/conf/mount/mount_linux.c b/conf/mount/mount_linux.c
index 61c0c23..cf85723 100644
--- a/conf/mount/mount_linux.c
+++ b/conf/mount/mount_linux.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: mount_linux.c,v 1.43 2005/07/09 19:41:06 ezk Exp $
+ * $Id: mount_linux.c,v 1.44 2005/07/29 10:47:19 ezk Exp $
  */
 
 /*
@@ -671,10 +671,14 @@ static int nfs_errormap[] = {
 int
 linux_nfs_error(int e)
 {
+  int ret = (nfsstat) NE_IO;
+
   if (e < NFS_LOMAP || e > NFS_HIMAP)
-    return (nfsstat)NE_IO;
-  e = nfs_errormap[e - NFS_LOMAP];
-  return (nfsstat)e;
+    ret = (nfsstat) NE_IO;
+  else
+    ret = nfs_errormap[e - NFS_LOMAP];
+  dlog("linux_nfs_error: map error %d to NFS error %d", e, ret);
+  return (nfsstat) ret;
 }
 
 
diff --git a/conf/umount/umount_linux.c b/conf/umount/umount_linux.c
index 6ab85df..b02666a 100644
--- a/conf/umount/umount_linux.c
+++ b/conf/umount/umount_linux.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: umount_linux.c,v 1.8 2005/07/25 23:49:41 ezk Exp $
+ * $Id: umount_linux.c,v 1.9 2005/07/29 10:47:19 ezk Exp $
  *
  */
 
@@ -213,15 +213,31 @@ umount_fs(char *mntdir, const char *mnttabname, u_int unmount_flags)
 
 #if defined(HAVE_UMOUNT2) && (defined(MNT2_GEN_OPT_FORCE) || defined(MNT2_GEN_OPT_DETACH))
 /*
- * Force unmount, no questions asked, without touching mnttab file.
- * The order here is relevant because we may want to try the "safer" detach
- * unmount before trying the more drastic "forced" unmount.
+ * Force unmount, no questions asked, without touching mnttab file.  Try
+ * detach first because it is safer: will remove the hung mnt point without
+ * affecting hung applications.  "Force" is more risky: it will cause the
+ * kernel to return EIO to applications stuck on a stat(2) of Amd.
  */
 int
 umount2_fs(const char *mntdir, u_int unmount_flags)
 {
   int error = 0;
 
+#ifdef MNT2_GEN_OPT_DETACH
+  if (unmount_flags & AMU_UMOUNT_DETACH) {
+    error = umount2(mntdir, MNT2_GEN_OPT_DETACH);
+    if (error < 0 && (errno == EINVAL || errno == ENOENT))
+      error = 0;		/* ignore EINVAL/ENOENT */
+    if (error < 0) {		/* don't try FORCE if detach succeeded */
+      plog(XLOG_WARNING, "%s: unmount/detach: %m", mntdir);
+      /* fall through to try "force" (if flag specified) */
+    } else {
+      dlog("%s: unmount/detach: OK", mntdir);
+      return error;
+    }
+  }
+#endif /* MNT2_GEN_OPT_DETACH */
+
 #ifdef MNT2_GEN_OPT_FORCE
   if (unmount_flags & AMU_UMOUNT_FORCE) {
     plog(XLOG_INFO, "umount2_fs: trying unmount/forced on %s", mntdir);
@@ -230,56 +246,12 @@ umount2_fs(const char *mntdir, u_int unmount_flags)
       error = 0;		/* ignore EINVAL/ENOENT */
     if (error < 0)
       plog(XLOG_WARNING, "%s: unmount/force: %m", mntdir);
-    else {
+    else
       dlog("%s: unmount/force: OK", mntdir);
-      goto out;
-    }
+    /* fall through to return whatever error we got (if any) */
   }
 #endif /* MNT2_GEN_OPT_FORCE */
 
-#ifdef MNT2_GEN_OPT_DETACH
-  /*
-   * XXX: the stat() below may hang this unmount attempt of a toplvl
-   * mount.  In that case, you may have to kill -9 the Amd process.  A
-   * better way to handle this would be to check mtab for an old amd
-   * process, send a kill -0 to it to see if the Amd process is alive, and
-   * only do the forced unmount if the older Amd process died.
-   */
-  if (unmount_flags & AMU_UMOUNT_DETACH) {
-    /*
-     * If I got an EBUSY from the above FORCE, then don't try to stat(), or
-     * it will hang.
-     */
-    if (error < 0 && errno == EBUSY) {
-      error = 0;
-    } else {
-      struct stat dummy;
-      dlog("umount_fs: try stat() before unmount/detach");
-      error = stat(mntdir, &dummy);
-    }
-    if (!error || (errno == ESTALE || errno == EIO)) {
-      if (error < 0)
-	plog(XLOG_INFO, "unmount2_fs: trying unmount/detach of %s (%m)",
-	     mntdir);
-      else
-	plog(XLOG_INFO, "unmount2_fs: trying unmount/detach of %s",
-	     mntdir);
-      error = umount2(mntdir, MNT2_GEN_OPT_DETACH);
-      if (error < 0 && (errno == EINVAL || errno == ENOENT))
-	error = 0;		/* ignore EINVAL/ENOENT */
-      if (error < 0)		/* don't try FORCE if detach succeeded */
-	plog(XLOG_WARNING, "%s: unmount/detach: %m", mntdir);
-      else {
-	dlog("%s: unmount/detach: OK", mntdir);
-	goto out;		/* superfluous (but symmetric code :-) */
-      }
-    }
-  }
-#endif /* MNT2_GEN_OPT_DETACH */
-
-#ifdef MNT2_GEN_OPT_DETACH
- out:
-#endif /* MNT2_GEN_OPT_DETACH */
   return error;
 }
 #endif /* HAVE_UMOUNT2 && (MNT2_GEN_OPT_FORCE || MNT2_GEN_OPT_DETACH) */