View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import static org.junit.Assert.*;
22  
23  import java.io.IOException;
24  import java.util.List;
25  import java.util.NavigableSet;
26  import java.util.Set;
27  import java.util.TreeSet;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.apache.hadoop.conf.Configuration;
32  import org.apache.hadoop.hbase.*;
33  import org.apache.hadoop.hbase.client.HTable;
34  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
35  import org.apache.hadoop.hbase.testclassification.LargeTests;
36  import org.apache.hadoop.hbase.util.Bytes;
37  import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
38  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
39  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
40  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
41  import org.apache.zookeeper.KeeperException;
42  import org.junit.Test;
43  import org.junit.experimental.categories.Category;
44  
45  /**
46   * Tests the restarting of everything as done during rolling restarts.
47   */
48  @Category(LargeTests.class)
49  public class  TestRollingRestart {
50    private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
51  
52    @Test (timeout=500000)
53    public void testBasicRollingRestart() throws Exception {
54  
55      // Start a cluster with 2 masters and 4 regionservers
56      final int NUM_MASTERS = 2;
57      final int NUM_RS = 3;
58      final int NUM_REGIONS_TO_CREATE = 20;
59  
60      int expectedNumRS = 3;
61  
62      // Start the cluster
63      log("Starting cluster");
64      Configuration conf = HBaseConfiguration.create();
65      conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
66      conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
67      HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
68      TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
69      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
70      log("Waiting for active/ready master");
71      cluster.waitForActiveAndReadyMaster();
72      ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
73          null);
74      HMaster master = cluster.getMaster();
75  
76      // Create a table with regions
77      byte [] table = Bytes.toBytes("tableRestart");
78      byte [] family = Bytes.toBytes("family");
79      log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
80      HTable ht = TEST_UTIL.createTable(table, family);
81      int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
82          NUM_REGIONS_TO_CREATE);
83      numRegions += 1; // catalogs
84      log("Waiting for no more RIT\n");
85      blockUntilNoRIT(zkw, master);
86      log("Disabling table\n");
87      TEST_UTIL.getHBaseAdmin().disableTable(table);
88      log("Waiting for no more RIT\n");
89      blockUntilNoRIT(zkw, master);
90      NavigableSet<String> regions = getAllOnlineRegions(cluster);
91      log("Verifying only catalog and namespace regions are assigned\n");
92      if (regions.size() != 2) {
93        for (String oregion : regions) log("Region still online: " + oregion);
94      }
95      assertEquals(2, regions.size());
96      log("Enabling table\n");
97      TEST_UTIL.getHBaseAdmin().enableTable(table);
98      log("Waiting for no more RIT\n");
99      blockUntilNoRIT(zkw, master);
100     log("Verifying there are " + numRegions + " assigned on cluster\n");
101     regions = getAllOnlineRegions(cluster);
102     assertRegionsAssigned(cluster, regions);
103     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
104 
105     // Add a new regionserver
106     log("Adding a fourth RS");
107     RegionServerThread restarted = cluster.startRegionServer();
108     expectedNumRS++;
109     restarted.waitForServerOnline();
110     log("Additional RS is online");
111     log("Waiting for no more RIT");
112     blockUntilNoRIT(zkw, master);
113     log("Verifying there are " + numRegions + " assigned on cluster");
114     assertRegionsAssigned(cluster, regions);
115     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
116 
117     // Master Restarts
118     List<MasterThread> masterThreads = cluster.getMasterThreads();
119     MasterThread activeMaster = null;
120     MasterThread backupMaster = null;
121     assertEquals(2, masterThreads.size());
122     if (masterThreads.get(0).getMaster().isActiveMaster()) {
123       activeMaster = masterThreads.get(0);
124       backupMaster = masterThreads.get(1);
125     } else {
126       activeMaster = masterThreads.get(1);
127       backupMaster = masterThreads.get(0);
128     }
129 
130     // Bring down the backup master
131     log("Stopping backup master\n\n");
132     backupMaster.getMaster().stop("Stop of backup during rolling restart");
133     cluster.hbaseCluster.waitOnMaster(backupMaster);
134 
135     // Bring down the primary master
136     log("Stopping primary master\n\n");
137     activeMaster.getMaster().stop("Stop of active during rolling restart");
138     cluster.hbaseCluster.waitOnMaster(activeMaster);
139 
140     // Start primary master
141     log("Restarting primary master\n\n");
142     activeMaster = cluster.startMaster();
143     cluster.waitForActiveAndReadyMaster();
144     master = activeMaster.getMaster();
145 
146     // Start backup master
147     log("Restarting backup master\n\n");
148     backupMaster = cluster.startMaster();
149 
150     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
151 
152     // RegionServer Restarts
153 
154     // Bring them down, one at a time, waiting between each to complete
155     List<RegionServerThread> regionServers =
156       cluster.getLiveRegionServerThreads();
157     int num = 1;
158     int total = regionServers.size();
159     for (RegionServerThread rst : regionServers) {
160       ServerName serverName = rst.getRegionServer().getServerName();
161       log("Stopping region server " + num + " of " + total + " [ " +
162           serverName + "]");
163       rst.getRegionServer().stop("Stopping RS during rolling restart");
164       cluster.hbaseCluster.waitOnRegionServer(rst);
165       log("Waiting for RS shutdown to be handled by master");
166       waitForRSShutdownToStartAndFinish(activeMaster, serverName);
167       log("RS shutdown done, waiting for no more RIT");
168       blockUntilNoRIT(zkw, master);
169       log("Verifying there are " + numRegions + " assigned on cluster");
170       assertRegionsAssigned(cluster, regions);
171       expectedNumRS--;
172       assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
173       log("Restarting region server " + num + " of " + total);
174       restarted = cluster.startRegionServer();
175       restarted.waitForServerOnline();
176       expectedNumRS++;
177       log("Region server " + num + " is back online");
178       log("Waiting for no more RIT");
179       blockUntilNoRIT(zkw, master);
180       log("Verifying there are " + numRegions + " assigned on cluster");
181       assertRegionsAssigned(cluster, regions);
182       assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
183       num++;
184     }
185     Thread.sleep(1000);
186     assertRegionsAssigned(cluster, regions);
187 
188     // Bring the RS hosting hbase:meta down
189     RegionServerThread metaServer = getServerHostingMeta(cluster);
190     log("Stopping server hosting hbase:meta #1");
191     metaServer.getRegionServer().stop("Stopping hbase:meta server");
192     cluster.hbaseCluster.waitOnRegionServer(metaServer);
193     log("Meta server down #1");
194     expectedNumRS--;
195     log("Waiting for meta server #1 RS shutdown to be handled by master");
196     waitForRSShutdownToStartAndFinish(activeMaster,
197         metaServer.getRegionServer().getServerName());
198     log("Waiting for no more RIT");
199     long start = System.currentTimeMillis();
200     do {
201       blockUntilNoRIT(zkw, master);
202     } while (getNumberOfOnlineRegions(cluster) < numRegions 
203         && System.currentTimeMillis()-start < 60000);
204     log("Verifying there are " + numRegions + " assigned on cluster");
205     assertRegionsAssigned(cluster, regions);
206     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
207 
208     // Kill off the server hosting hbase:meta again
209     metaServer = getServerHostingMeta(cluster);
210     log("Stopping server hosting hbase:meta #2");
211     metaServer.getRegionServer().stop("Stopping hbase:meta server");
212     cluster.hbaseCluster.waitOnRegionServer(metaServer);
213     log("Meta server down");
214     expectedNumRS--;
215     log("Waiting for RS shutdown to be handled by master");
216     waitForRSShutdownToStartAndFinish(activeMaster,
217         metaServer.getRegionServer().getServerName());
218     log("RS shutdown done, waiting for no more RIT");
219     blockUntilNoRIT(zkw, master);
220     log("Verifying there are " + numRegions + " assigned on cluster");
221     assertRegionsAssigned(cluster, regions);
222     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
223 
224     // Start 3 RS again
225     cluster.startRegionServer().waitForServerOnline();
226     cluster.startRegionServer().waitForServerOnline();
227     cluster.startRegionServer().waitForServerOnline();
228     Thread.sleep(1000);
229     log("Waiting for no more RIT");
230     blockUntilNoRIT(zkw, master);
231     log("Verifying there are " + numRegions + " assigned on cluster");
232     assertRegionsAssigned(cluster, regions);
233     // Shutdown server hosting META
234     metaServer = getServerHostingMeta(cluster);
235     log("Stopping server hosting hbase:meta (1 of 3)");
236     metaServer.getRegionServer().stop("Stopping hbase:meta server");
237     cluster.hbaseCluster.waitOnRegionServer(metaServer);
238     log("Meta server down (1 of 3)");
239     log("Waiting for RS shutdown to be handled by master");
240     waitForRSShutdownToStartAndFinish(activeMaster,
241         metaServer.getRegionServer().getServerName());
242     log("RS shutdown done, waiting for no more RIT");
243     blockUntilNoRIT(zkw, master);
244     log("Verifying there are " + numRegions + " assigned on cluster");
245     assertRegionsAssigned(cluster, regions);
246 
247     // Shutdown server hosting hbase:meta again
248     metaServer = getServerHostingMeta(cluster);
249     log("Stopping server hosting hbase:meta (2 of 3)");
250     metaServer.getRegionServer().stop("Stopping hbase:meta server");
251     cluster.hbaseCluster.waitOnRegionServer(metaServer);
252     log("Meta server down (2 of 3)");
253     log("Waiting for RS shutdown to be handled by master");
254     waitForRSShutdownToStartAndFinish(activeMaster,
255         metaServer.getRegionServer().getServerName());
256     log("RS shutdown done, waiting for no more RIT");
257     blockUntilNoRIT(zkw, master);
258     log("Verifying there are " + numRegions + " assigned on cluster");
259     assertRegionsAssigned(cluster, regions);
260 
261     // Shutdown server hosting hbase:meta again
262     metaServer = getServerHostingMeta(cluster);
263     log("Stopping server hosting hbase:meta (3 of 3)");
264     metaServer.getRegionServer().stop("Stopping hbase:meta server");
265     cluster.hbaseCluster.waitOnRegionServer(metaServer);
266     log("Meta server down (3 of 3)");
267     log("Waiting for RS shutdown to be handled by master");
268     waitForRSShutdownToStartAndFinish(activeMaster,
269         metaServer.getRegionServer().getServerName());
270     log("RS shutdown done, waiting for no more RIT");
271     blockUntilNoRIT(zkw, master);
272     log("Verifying there are " + numRegions + " assigned on cluster");
273     assertRegionsAssigned(cluster, regions);
274 
275     if (cluster.getRegionServerThreads().size() != 1) {
276       log("Online regionservers:");
277       for (RegionServerThread rst : cluster.getRegionServerThreads()) {
278         log("RS: " + rst.getRegionServer().getServerName());
279       }
280     }
281     assertEquals(2, cluster.getRegionServerThreads().size());
282 
283 
284     // TODO: Bring random 3 of 4 RS down at the same time
285 
286     ht.close();
287     // Stop the cluster
288     TEST_UTIL.shutdownMiniCluster();
289   }
290 
291   private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
292   throws KeeperException, InterruptedException {
293     ZKAssign.blockUntilNoRIT(zkw);
294     master.assignmentManager.waitUntilNoRegionsInTransition(60000);
295   }
296 
297   private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
298       ServerName serverName) throws InterruptedException {
299     ServerManager sm = activeMaster.getMaster().getServerManager();
300     // First wait for it to be in dead list
301     while (!sm.getDeadServers().isDeadServer(serverName)) {
302       log("Waiting for [" + serverName + "] to be listed as dead in master");
303       Thread.sleep(1);
304     }
305     log("Server [" + serverName + "] marked as dead, waiting for it to " +
306         "finish dead processing");
307     while (sm.areDeadServersInProgress()) {
308       log("Server [" + serverName + "] still being processed, waiting");
309       Thread.sleep(100);
310     }
311     log("Server [" + serverName + "] done with server shutdown processing");
312   }
313 
314   private void log(String msg) {
315     LOG.debug("\n\nTRR: " + msg + "\n");
316   }
317 
318   private RegionServerThread getServerHostingMeta(MiniHBaseCluster cluster)
319       throws IOException {
320     return getServerHosting(cluster, HRegionInfo.FIRST_META_REGIONINFO);
321   }
322 
323   private RegionServerThread getServerHosting(MiniHBaseCluster cluster,
324       HRegionInfo region) throws IOException {
325     for (RegionServerThread rst : cluster.getRegionServerThreads()) {
326       if (ProtobufUtil.getOnlineRegions(rst.getRegionServer()).contains(region)) {
327         return rst;
328       }
329     }
330     return null;
331   }
332 
333   private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
334     int numFound = 0;
335     for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
336       numFound += rst.getRegionServer().getNumberOfOnlineRegions();
337     }
338     return numFound;
339   }
340   
341   private void assertRegionsAssigned(MiniHBaseCluster cluster,
342       Set<String> expectedRegions) throws IOException {
343     int numFound = getNumberOfOnlineRegions(cluster);
344     if (expectedRegions.size() > numFound) {
345       log("Expected to find " + expectedRegions.size() + " but only found"
346           + " " + numFound);
347       NavigableSet<String> foundRegions = getAllOnlineRegions(cluster);
348       for (String region : expectedRegions) {
349         if (!foundRegions.contains(region)) {
350           log("Missing region: " + region);
351         }
352       }
353       assertEquals(expectedRegions.size(), numFound);
354     } else if (expectedRegions.size() < numFound) {
355       int doubled = numFound - expectedRegions.size();
356       log("Expected to find " + expectedRegions.size() + " but found"
357           + " " + numFound + " (" + doubled + " double assignments?)");
358       NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
359       for (String region : doubleRegions) {
360         log("Region is double assigned: " + region);
361       }
362       assertEquals(expectedRegions.size(), numFound);
363     } else {
364       log("Success!  Found expected number of " + numFound + " regions");
365     }
366   }
367 
368   private NavigableSet<String> getAllOnlineRegions(MiniHBaseCluster cluster)
369       throws IOException {
370     NavigableSet<String> online = new TreeSet<String>();
371     for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
372       for (HRegionInfo region : ProtobufUtil.getOnlineRegions(rst.getRegionServer())) {
373         online.add(region.getRegionNameAsString());
374       }
375     }
376     return online;
377   }
378 
379   private NavigableSet<String> getDoubleAssignedRegions(
380       MiniHBaseCluster cluster) throws IOException {
381     NavigableSet<String> online = new TreeSet<String>();
382     NavigableSet<String> doubled = new TreeSet<String>();
383     for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
384       for (HRegionInfo region : ProtobufUtil.getOnlineRegions(rst.getRegionServer())) {
385         if(!online.add(region.getRegionNameAsString())) {
386           doubled.add(region.getRegionNameAsString());
387         }
388       }
389     }
390     return doubled;
391   }
392 
393 
394 }
395