View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.catalog;
19  
20  import com.google.common.annotations.VisibleForTesting;
21  import com.google.common.base.Stopwatch;
22  
23  import org.apache.commons.logging.Log;
24  import org.apache.commons.logging.LogFactory;
25  import org.apache.hadoop.hbase.classification.InterfaceAudience;
26  import org.apache.hadoop.conf.Configuration;
27  import org.apache.hadoop.hbase.Abortable;
28  import org.apache.hadoop.hbase.HRegionInfo;
29  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
30  import org.apache.hadoop.hbase.ServerName;
31  import org.apache.hadoop.hbase.client.HConnection;
32  import org.apache.hadoop.hbase.client.HConnectionManager;
33  import org.apache.hadoop.hbase.client.HTable;
34  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
35  import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException;
36  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
37  import org.apache.hadoop.hbase.master.RegionState;
38  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
39  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
40  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
41  import org.apache.hadoop.hbase.util.Bytes;
42  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
43  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
44  import org.apache.hadoop.ipc.RemoteException;
45  
46  import java.io.EOFException;
47  import java.io.IOException;
48  import java.net.ConnectException;
49  import java.net.NoRouteToHostException;
50  import java.net.SocketException;
51  import java.net.SocketTimeoutException;
52  import java.net.UnknownHostException;
53  
54  /**
55   * Tracks the availability of the catalog tables
56   * <code>hbase:meta</code>.
57   *
58   * This class is "read-only" in that the locations of the catalog tables cannot
59   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
60   * and location of <code>hbase:meta</code>.
61   *
62   * <p>Call {@link #start()} to start up operation.  Call {@link #stop()} to
63   * interrupt waits and close up shop.
64   */
65  @InterfaceAudience.Private
66  public class CatalogTracker {
67    // TODO JDC 11/30 We don't even have ROOT anymore, revisit
68    // TODO: This class needs a rethink.  The original intent was that it would be
69    // the one-stop-shop for meta locations and that it would get this
70    // info from reading and watching zk state.  The class was to be used by
71    // servers when they needed to know of meta movement but also by
72    // client-side (inside in HTable) so rather than figure meta
73    // locations on fault, the client would instead get notifications out of zk.
74    //
75    // But this original intent is frustrated by the fact that this class has to
76    // read an hbase table, the -ROOT- table, to figure out the hbase:meta region
77    // location which means we depend on an HConnection.  HConnection will do
78    // retrying but also, it has its own mechanism for finding root and meta
79    // locations (and for 'verifying'; it tries the location and if it fails, does
80    // new lookup, etc.).  So, at least for now, HConnection (or HTable) can't
81    // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
82    // For HT keep up a session with ZK?  Rather, shouldn't we do like asynchbase
83    // where we'd open a connection to zk, read what we need then let the
84    // connection go?).  The 'fix' is make it so both root and meta addresses
85    // are wholly up in zk -- not in zk (root) -- and in an hbase table (meta).
86    //
87    // But even then, this class does 'verification' of the location and it does
88    // this by making a call over an HConnection (which will do its own root
89    // and meta lookups).  Isn't this verification 'useless' since when we
90    // return, whatever is dependent on the result of this call then needs to
91    // use HConnection; what we have verified may change in meantime (HConnection
92    // uses the CT primitives, the root and meta trackers finding root locations).
93    //
94    // When meta is moved to zk, this class may make more sense.  In the
95    // meantime, it does not cohere.  It should just watch meta and root and
96    // NOT do verification -- let that be out in HConnection since it's going to
97    // be done there ultimately anyways.
98    //
99    // This class has spread throughout the codebase.  It needs to be reined in.
100   // This class should be used server-side only, even if we move meta location
101   // up into zk.  Currently it's used over in the client package. It's used in
102   // MetaReader and MetaEditor classes usually just to get the Configuration
103   // its using (It does this indirectly by asking its HConnection for its
104   // Configuration and even then this is just used to get an HConnection out on
105   // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
106   // doing CT fixup. St.Ack 09/30/2011.
107   //
108 
109   // TODO: Timeouts have never been as advertised in here and it's worse now
110   // with retries; i.e. the HConnection retries and pause goes ahead whatever
111   // the passed timeout is.  Fix.
112   private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
      // Connection used to obtain admin interfaces to region servers; closed by stop().
113   private final HConnection connection;
      // Watcher handed to the meta region tracker; either supplied by the caller
      // or created by the constructor.
114   private final ZooKeeperWatcher zookeeper;
      // Source of the hbase:meta location and state reported by this class.
115   private final MetaRegionTracker metaRegionTracker;
      // True when the constructor created the watcher itself, in which case
      // stop() also closes it.
116   private boolean instantiatedzkw = false;
      // Aborted by start() when the meta region tracker fails to start.
117   private Abortable abortable;
118 
      // Set by stop(); polled by the no-argument waitForMeta() loop.
119   private volatile boolean stopped = false;
120 
      // Region name passed to verifyRegionLocation() by verifyMetaRegionLocation().
121   static final byte [] META_REGION_NAME =
122     HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
123 
124   /**
125    * Constructs a catalog tracker. Find current state of catalog tables.
126    * Begin active tracking by executing {@link #start()} post construction. Does
127    * not timeout.
128    *
129    * @param conf
130    *          the {@link Configuration} from which a {@link HConnection} will be
131    *          obtained; if problem, this connections
132    *          {@link HConnection#abort(String, Throwable)} will be called.
133    * @throws IOException
134    */
135   public CatalogTracker(final Configuration conf) throws IOException {
136     this(null, conf, null);
137   }
138 
139   /**
140    * Constructs the catalog tracker.  Find current state of catalog tables.
141    * Begin active tracking by executing {@link #start()} post construction.
142    * Does not timeout.
143    * @param zk If zk is null, we'll create an instance (and shut it down
144    * when {@link #stop()} is called) else we'll use what is passed.
145    * @param conf
146    * @param abortable If fatal exception we'll call abort on this.  May be null.
147    * If it is we'll use the Connection associated with the passed
148    * {@link Configuration} as our Abortable.
149    * @throws IOException
150    */
151   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
152       Abortable abortable)
153   throws IOException {
154     this(zk, conf, HConnectionManager.getConnection(conf), abortable);
155   }
156 
157   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
158       HConnection connection, Abortable abortable)
159   throws IOException {
160     this.connection = connection;
161     if (abortable == null) {
162       // A connection is abortable.
163       this.abortable = this.connection;
164     }
165     Abortable throwableAborter = new Abortable() {
166 
167       @Override
168       public void abort(String why, Throwable e) {
169         throw new RuntimeException(why, e);
170       }
171 
172       @Override
173       public boolean isAborted() {
174         return true;
175       }
176 
177     };
178     if (zk == null) {
179       // Create our own.  Set flag so we tear it down on stop.
180       this.zookeeper =
181         new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
182           abortable);
183       instantiatedzkw = true;
184     } else {
185       this.zookeeper = zk;
186     }
187     this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
188   }
189 
190   /**
191    * Starts the catalog tracker.
192    * Determines current availability of catalog tables and ensures all further
193    * transitions of either region are tracked.
194    * @throws IOException
195    * @throws InterruptedException
196    */
197   public void start() throws IOException, InterruptedException {
198     LOG.debug("Starting catalog tracker " + this);
199     try {
200       this.metaRegionTracker.start();
201     } catch (RuntimeException e) {
202       Throwable t = e.getCause();
203       this.abortable.abort(e.getMessage(), t);
204       throw new IOException("Attempt to start meta tracker failed.", t);
205     }
206   }
207 
208   /**
209    * @return True if we are stopped. Call only after start else indeterminate answer.
210    */
211   @VisibleForTesting
212   public boolean isStopped() {
213     return this.stopped;
214   }
215 
216   /**
217    * Stop working.
218    * Interrupts any ongoing waits.
219    */
220   public void stop() {
221     if (!this.stopped) {
222       LOG.debug("Stopping catalog tracker " + this);
223       this.stopped = true;
224       this.metaRegionTracker.stop();
225       try {
226         if (this.connection != null) {
227           this.connection.close();
228         }
229       } catch (IOException e) {
230         // Although the {@link Closeable} interface throws an {@link
231         // IOException}, in reality, the implementation would never do that.
232         LOG.error("Attempt to close catalog tracker's connection failed.", e);
233       }
234       if (this.instantiatedzkw) {
235         this.zookeeper.close();
236       }
237     }
238   }
239 
240   /**
241    * Gets the current location for <code>hbase:meta</code> or null if location is
242    * not currently available.
243    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
244    * if none available
245    * @throws InterruptedException
246    */
247   public ServerName getMetaLocation() throws InterruptedException {
248     return this.metaRegionTracker.getMetaRegionLocation();
249   }
250 
251   /**
252    * Checks whether meta regionserver znode has some non null data.
253    * @return true if data is not null, false otherwise.
254    */
255   public boolean isMetaLocationAvailable() {
256     return this.metaRegionTracker.isLocationAvailable();
257   }
258   /**
259    * Gets the current location for <code>hbase:meta</code> if available and waits
260    * for up to the specified timeout if not immediately available.  Returns null
261    * if the timeout elapses before root is available.
262    * @param timeout maximum time to wait for root availability, in milliseconds
263    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
264    * if none available
265    * @throws InterruptedException if interrupted while waiting
266    * @throws NotAllMetaRegionsOnlineException if meta not available before
267    * timeout
268    */
269   public ServerName waitForMeta(final long timeout)
270   throws InterruptedException, NotAllMetaRegionsOnlineException {
271     ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
272     if (sn == null) {
273       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
274     }
275     return sn;
276   }
277   
278   /**
279    * Get meta region state
280    * @return RegionState
281    */
282   public RegionState getMetaRegionState() {
283     return metaRegionTracker.getMetaRegionState();
284   }
285   
286   
287 
288   /**
289    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
290    * waiting up to the specified timeout for availability.
291    * @param timeout How long to wait on meta location
292    * @see #waitForMeta for additional information
293    * @return connection to server hosting meta
294    * @throws InterruptedException
295    * @throws NotAllMetaRegionsOnlineException if timed out waiting
296    * @throws IOException
297    * @deprecated Use #getMetaServerConnection(long)
298    */
299   public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
300   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
301     return getMetaServerConnection(timeout);
302   }
303 
304   /**
305    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
306    * waiting up to the specified timeout for availability.
307    * <p>WARNING: Does not retry.  Use an {@link HTable} instead.
308    * @param timeout How long to wait on meta location
309    * @see #waitForMeta for additional information
310    * @return connection to server hosting meta
311    * @throws InterruptedException
312    * @throws NotAllMetaRegionsOnlineException if timed out waiting
313    * @throws IOException
314    */
315   AdminService.BlockingInterface getMetaServerConnection(long timeout)
316   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
317     return getCachedConnection(waitForMeta(timeout));
318   }
319 
320   /**
321    * Waits indefinitely for availability of <code>hbase:meta</code>.  Used during
322    * cluster startup.  Does not verify meta, just that something has been
323    * set up in zk.
324    * @see #waitForMeta(long)
325    * @throws InterruptedException if interrupted while waiting
326    */
327   public void waitForMeta() throws InterruptedException {
328     Stopwatch stopwatch = new Stopwatch().start();
329     while (!this.stopped) {
330       try {
331         if (waitForMeta(100) != null) break;
332         long sleepTime = stopwatch.elapsedMillis();
333         // +1 in case sleepTime=0
334         if ((sleepTime + 1) % 10000 == 0) {
335           LOG.warn("Have been waiting for meta to be assigned for " + sleepTime + "ms");
336         }
337       } catch (NotAllMetaRegionsOnlineException e) {
338         if (LOG.isTraceEnabled()) {
339           LOG.trace("hbase:meta still not available, sleeping and retrying." +
340           " Reason: " + e.getMessage());
341         }
342       }
343     }
344   }
345 
346   /**
347    * @param sn ServerName to get a connection against.
348    * @return The AdminProtocol we got when we connected to <code>sn</code>
349    * May have come from cache, may not be good, may have been setup by this
350    * invocation, or may be null.
351    * @throws IOException
352    */
353   private AdminService.BlockingInterface getCachedConnection(ServerName sn)
354   throws IOException {
355     if (sn == null) {
356       return null;
357     }
358     AdminService.BlockingInterface service = null;
359     try {
360       service = connection.getAdmin(sn);
361     } catch (RetriesExhaustedException e) {
362       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
363         // Catch this; presume it means the cached connection has gone bad.
364       } else {
365         throw e;
366       }
367     } catch (SocketTimeoutException e) {
368       LOG.debug("Timed out connecting to " + sn);
369     } catch (NoRouteToHostException e) {
370       LOG.debug("Connecting to " + sn, e);
371     } catch (SocketException e) {
372       LOG.debug("Exception connecting to " + sn);
373     } catch (UnknownHostException e) {
374       LOG.debug("Unknown host exception connecting to  " + sn);
375     } catch (FailedServerException e) {
376       if (LOG.isDebugEnabled()) {
377         LOG.debug("Server " + sn + " is in failed server list.");
378       }
379     } catch (IOException ioe) {
380       Throwable cause = ioe.getCause();
381       if (ioe instanceof ConnectException) {
382         // Catch. Connect refused.
383       } else if (cause != null && cause instanceof EOFException) {
384         // Catch. Other end disconnected us.
385       } else if (cause != null && cause.getMessage() != null &&
386         cause.getMessage().toLowerCase().contains("connection reset")) {
387         // Catch. Connection reset.
388       } else {
389         throw ioe;
390       }
391 
392     }
393     return service;
394   }
395 
396   /**
397    * Verify we can connect to <code>hostingServer</code> and that its carrying
398    * <code>regionName</code>.
399    * @param hostingServer Interface to the server hosting <code>regionName</code>
400    * @param address The servername that goes with the <code>metaServer</code>
401    * Interface.  Used logging.
402    * @param regionName The regionname we are interested in.
403    * @return True if we were able to verify the region located at other side of
404    * the Interface.
405    * @throws IOException
406    */
407   // TODO: We should be able to get the ServerName from the AdminProtocol
408   // rather than have to pass it in.  Its made awkward by the fact that the
409   // HRI is likely a proxy against remote server so the getServerName needs
410   // to be fixed to go to a local method or to a cache before we can do this.
411   private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
412       final ServerName address, final byte [] regionName)
413   throws IOException {
414     if (hostingServer == null) {
415       LOG.info("Passed hostingServer is null");
416       return false;
417     }
418     Throwable t = null;
419     try {
420       // Try and get regioninfo from the hosting server.
421       return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
422     } catch (ConnectException e) {
423       t = e;
424     } catch (RetriesExhaustedException e) {
425       t = e;
426     } catch (RemoteException e) {
427       IOException ioe = e.unwrapRemoteException();
428       t = ioe;
429     } catch (IOException e) {
430       Throwable cause = e.getCause();
431       if (cause != null && cause instanceof EOFException) {
432         t = cause;
433       } else if (cause != null && cause.getMessage() != null
434           && cause.getMessage().contains("Connection reset")) {
435         t = cause;
436       } else {
437         t = e;
438       }
439     }
440     LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
441       " at address=" + address + ", exception=" + t);
442     return false;
443   }
444 
445   /**
446    * Verify <code>hbase:meta</code> is deployed and accessible.
447    * @param timeout How long to wait on zk for meta address (passed through to
448    * the internal call to {@link #waitForMetaServerConnection(long)}.
449    * @return True if the <code>hbase:meta</code> location is healthy.
450    * @throws IOException
451    * @throws InterruptedException
452    */
453   public boolean verifyMetaRegionLocation(final long timeout)
454   throws InterruptedException, IOException {
455     AdminService.BlockingInterface service = null;
456     try {
457       service = waitForMetaServerConnection(timeout);
458     } catch (NotAllMetaRegionsOnlineException e) {
459       // Pass
460     } catch (ServerNotRunningYetException e) {
461       // Pass -- remote server is not up so can't be carrying root
462     } catch (UnknownHostException e) {
463       // Pass -- server name doesn't resolve so it can't be assigned anything.
464     } catch (RegionServerStoppedException e) {
465       // Pass -- server name sends us to a server that is dying or already dead.
466     }
467     return (service == null)? false:
468       verifyRegionLocation(service,
469           this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
470   }
471 
472   public HConnection getConnection() {
473     return this.connection;
474   }
475 }