View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotEquals;
27  import static org.junit.Assert.assertNotNull;
28  import static org.junit.Assert.assertTrue;
29  import static org.junit.Assert.fail;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Collection;
34  import java.util.HashMap;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.concurrent.Callable;
40  import java.util.concurrent.CountDownLatch;
41  import java.util.concurrent.ExecutorService;
42  import java.util.concurrent.Executors;
43  import java.util.concurrent.Future;
44  import java.util.concurrent.ScheduledThreadPoolExecutor;
45  import java.util.concurrent.SynchronousQueue;
46  import java.util.concurrent.ThreadPoolExecutor;
47  import java.util.concurrent.TimeUnit;
48  import java.util.concurrent.atomic.AtomicBoolean;
49  
50  import org.apache.commons.io.IOUtils;
51  import org.apache.commons.logging.Log;
52  import org.apache.commons.logging.LogFactory;
53  import org.apache.hadoop.conf.Configuration;
54  import org.apache.hadoop.fs.FileStatus;
55  import org.apache.hadoop.fs.FileSystem;
56  import org.apache.hadoop.fs.Path;
57  import org.apache.hadoop.hbase.ClusterStatus;
58  import org.apache.hadoop.hbase.HBaseTestingUtility;
59  import org.apache.hadoop.hbase.HColumnDescriptor;
60  import org.apache.hadoop.hbase.HConstants;
61  import org.apache.hadoop.hbase.HRegionInfo;
62  import org.apache.hadoop.hbase.HRegionLocation;
63  import org.apache.hadoop.hbase.HTableDescriptor;
64  import org.apache.hadoop.hbase.TableExistsException;
65  import org.apache.hadoop.hbase.testclassification.LargeTests;
66  import org.apache.hadoop.hbase.MiniHBaseCluster;
67  import org.apache.hadoop.hbase.ServerName;
68  import org.apache.hadoop.hbase.TableName;
69  import org.apache.hadoop.hbase.catalog.MetaEditor;
70  import org.apache.hadoop.hbase.client.Delete;
71  import org.apache.hadoop.hbase.client.Durability;
72  import org.apache.hadoop.hbase.client.Get;
73  import org.apache.hadoop.hbase.client.HBaseAdmin;
74  import org.apache.hadoop.hbase.client.HConnection;
75  import org.apache.hadoop.hbase.client.HConnectionManager;
76  import org.apache.hadoop.hbase.client.HTable;
77  import org.apache.hadoop.hbase.client.MetaScanner;
78  import org.apache.hadoop.hbase.client.Put;
79  import org.apache.hadoop.hbase.client.Result;
80  import org.apache.hadoop.hbase.client.ResultScanner;
81  import org.apache.hadoop.hbase.client.Scan;
82  import org.apache.hadoop.hbase.io.hfile.TestHFile;
83  import org.apache.hadoop.hbase.master.AssignmentManager;
84  import org.apache.hadoop.hbase.master.HMaster;
85  import org.apache.hadoop.hbase.master.RegionState;
86  import org.apache.hadoop.hbase.master.RegionStates;
87  import org.apache.hadoop.hbase.master.TableLockManager;
88  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
89  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
90  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
91  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
92  import org.apache.hadoop.hbase.regionserver.HRegion;
93  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
94  import org.apache.hadoop.hbase.regionserver.HRegionServer;
95  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
96  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
97  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
98  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
99  import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
100 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
101 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
102 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
103 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
104 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
105 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
106 import org.apache.zookeeper.KeeperException;
107 import org.junit.AfterClass;
108 import org.junit.Assert;
109 import org.junit.Before;
110 import org.junit.BeforeClass;
111 import org.junit.Ignore;
112 import org.junit.Test;
113 import org.junit.experimental.categories.Category;
114 import org.junit.rules.TestName;
115 
116 import com.google.common.collect.Multimap;
117 
118 /**
119  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
120  */
121 @Category(LargeTests.class)
122 public class TestHBaseFsck {
123   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
124   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
125   private final static Configuration conf = TEST_UTIL.getConfiguration();
126   private final static String FAM_STR = "fam";
127   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
128   private final static int REGION_ONLINE_TIMEOUT = 800;
129   private static RegionStates regionStates;
130   private static ExecutorService executorService;
131 
132   // for the instance, reset every test run
133   private HTable tbl;
134   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
135     Bytes.toBytes("B"), Bytes.toBytes("C") };
136   // one row per region.
137   private final static byte[][] ROWKEYS= new byte[][] {
138     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
139     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
140 
141   @SuppressWarnings("deprecation")
142   @BeforeClass
143   public static void setUpBeforeClass() throws Exception {
144     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.handler.count", 2);
145     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.metahandler.count", 2);
146     TEST_UTIL.getConfiguration().setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
147     TEST_UTIL.startMiniCluster(3);
148     TEST_UTIL.setHDFSClientRetry(0);
149 
150     executorService = new ThreadPoolExecutor(1, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
151         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
152 
153     AssignmentManager assignmentManager =
154       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
155     regionStates = assignmentManager.getRegionStates();
156     TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);
157   }
158 
159   @AfterClass
160   public static void tearDownAfterClass() throws Exception {
161     TEST_UTIL.shutdownMiniCluster();
162   }
163 
164   @Test
165   public void testHBaseFsck() throws Exception {
166     assertNoErrors(doFsck(conf, false));
167     String table = "tableBadMetaAssign";
168     TEST_UTIL.createTable(Bytes.toBytes(table), FAM);
169 
170     // We created 1 table, should be fine
171     assertNoErrors(doFsck(conf, false));
172 
173     // Now let's mess it up and change the assignment in hbase:meta to
174     // point to a different region server
175     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
176     Scan scan = new Scan();
177     scan.setStartRow(Bytes.toBytes(table+",,"));
178     ResultScanner scanner = meta.getScanner(scan);
179     HRegionInfo hri = null;
180 
181     Result res = scanner.next();
182     ServerName currServer =
183       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
184           HConstants.SERVER_QUALIFIER));
185     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
186         HConstants.STARTCODE_QUALIFIER));
187 
188     for (JVMClusterUtil.RegionServerThread rs :
189         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
190 
191       ServerName sn = rs.getRegionServer().getServerName();
192 
193       // When we find a diff RS, change the assignment and break
194       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
195           startCode != sn.getStartcode()) {
196         Put put = new Put(res.getRow());
197         put.setDurability(Durability.SKIP_WAL);
198         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
199           Bytes.toBytes(sn.getHostAndPort()));
200         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
201           Bytes.toBytes(sn.getStartcode()));
202         meta.put(put);
203         hri = HRegionInfo.getHRegionInfo(res);
204         break;
205       }
206     }
207 
208     // Try to fix the data
209     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
210         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
211 
212     TEST_UTIL.getHBaseCluster().getMaster()
213       .getAssignmentManager().waitForAssignment(hri);
214 
215     // Should be fixed now
216     assertNoErrors(doFsck(conf, false));
217 
218     // comment needed - what is the purpose of this line
219     HTable t = new HTable(conf, Bytes.toBytes(table), executorService);
220     ResultScanner s = t.getScanner(new Scan());
221     s.close();
222     t.close();
223 
224     scanner.close();
225     meta.close();
226   }
227 
228   @Test(timeout=180000)
229   public void testFixAssignmentsWhenMETAinTransition() throws Exception {
230     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
231     HBaseAdmin admin = null;
232     try {
233       admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
234       admin.closeRegion(cluster.getServerHoldingMeta(),
235           HRegionInfo.FIRST_META_REGIONINFO);
236     } finally {
237       if (admin != null) {
238         admin.close();
239       }
240     }
241     regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
242     MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper());
243     assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
244     HBaseFsck hbck = doFsck(conf, true);
245     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
246         ERROR_CODE.NULL_META_REGION });
247     assertNoErrors(doFsck(conf, false));
248   }
249 
250   /**
251    * Create a new region in META.
252    */
253   private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
254       htd, byte[] startKey, byte[] endKey)
255       throws IOException {
256     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
257     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
258     MetaEditor.addRegionToMeta(meta, hri);
259     meta.close();
260     return hri;
261   }
262 
263   /**
264    * Debugging method to dump the contents of meta.
265    */
266   private void dumpMeta(TableName tableName) throws IOException {
267     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
268     for (byte[] row : metaRows) {
269       LOG.info(Bytes.toString(row));
270     }
271   }
272 
273   /**
274    * This method is used to undeploy a region -- close it and attempt to
275    * remove its state from the Master.
276    */
277   private void undeployRegion(HBaseAdmin admin, ServerName sn,
278       HRegionInfo hri) throws IOException, InterruptedException {
279     try {
280       HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
281       if (!hri.isMetaTable()) {
282         admin.offline(hri.getRegionName());
283       }
284     } catch (IOException ioe) {
285       LOG.warn("Got exception when attempting to offline region "
286           + Bytes.toString(hri.getRegionName()), ioe);
287     }
288   }
289   /**
290    * Delete a region from assignments, meta, or completely from hdfs.
291    * @param unassign if true unassign region if assigned
292    * @param metaRow  if true remove region's row from META
293    * @param hdfs if true remove region's dir in HDFS
294    */
295   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
296       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
297       boolean hdfs) throws IOException, InterruptedException {
298     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
299   }
300 
301   /**
302    * Delete a region from assignments, meta, or completely from hdfs.
303    * @param unassign if true unassign region if assigned
304    * @param metaRow  if true remove region's row from META
305    * @param hdfs if true remove region's dir in HDFS
306    * @param regionInfoOnly if true remove a region dir's .regioninfo file
307    */
308   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
309       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
310       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
311     LOG.info("** Before delete:");
312     dumpMeta(htd.getTableName());
313 
314     Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
315     for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
316       HRegionInfo hri = e.getKey();
317       ServerName hsa = e.getValue();
318       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
319           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
320 
321         LOG.info("RegionName: " +hri.getRegionNameAsString());
322         byte[] deleteRow = hri.getRegionName();
323 
324         if (unassign) {
325           LOG.info("Undeploying region " + hri + " from server " + hsa);
326           undeployRegion(new HBaseAdmin(conf), hsa, hri);
327         }
328 
329         if (regionInfoOnly) {
330           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
331           Path rootDir = FSUtils.getRootDir(conf);
332           FileSystem fs = rootDir.getFileSystem(conf);
333           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
334               hri.getEncodedName());
335           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
336           fs.delete(hriPath, true);
337         }
338 
339         if (hdfs) {
340           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
341           Path rootDir = FSUtils.getRootDir(conf);
342           FileSystem fs = rootDir.getFileSystem(conf);
343           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
344               hri.getEncodedName());
345           HBaseFsck.debugLsr(conf, p);
346           boolean success = fs.delete(p, true);
347           LOG.info("Deleted " + p + " sucessfully? " + success);
348           HBaseFsck.debugLsr(conf, p);
349         }
350 
351         if (metaRow) {
352           HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
353           Delete delete = new Delete(deleteRow);
354           meta.delete(delete);
355         }
356       }
357       LOG.info(hri.toString() + hsa.toString());
358     }
359 
360     TEST_UTIL.getMetaTableRows(htd.getTableName());
361     LOG.info("*** After delete:");
362     dumpMeta(htd.getTableName());
363   }
364 
365   /**
366    * Setup a clean table before we start mucking with it.
367    *
368    * @throws IOException
369    * @throws InterruptedException
370    * @throws KeeperException
371    */
372   HTable setupTable(TableName tablename) throws Exception {
373     HTableDescriptor desc = new HTableDescriptor(tablename);
374     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
375     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
376     TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
377     tbl = new HTable(TEST_UTIL.getConfiguration(), tablename, executorService);
378 
379     List<Put> puts = new ArrayList<Put>();
380     for (byte[] row : ROWKEYS) {
381       Put p = new Put(row);
382       p.add(FAM, Bytes.toBytes("val"), row);
383       puts.add(p);
384     }
385     tbl.put(puts);
386     tbl.flushCommits();
387     return tbl;
388   }
389 
390   /**
391    * Counts the number of row to verify data loss or non-dataloss.
392    */
393   int countRows() throws IOException {
394      Scan s = new Scan();
395      ResultScanner rs = tbl.getScanner(s);
396      int i = 0;
397      while(rs.next() !=null) {
398        i++;
399      }
400      return i;
401   }
402 
403   /**
404    * delete table in preparation for next test
405    *
406    * @param tablename
407    * @throws IOException
408    */
409   void deleteTable(TableName tablename) throws IOException {
410     HBaseAdmin admin = new HBaseAdmin(conf);
411     admin.getConnection().clearRegionCache();
412     if (admin.isTableEnabled(tablename)) {
413       admin.disableTableAsync(tablename);
414     }
415     long totalWait = 0;
416     long maxWait = 30*1000;
417     long sleepTime = 250;
418     while (!admin.isTableDisabled(tablename)) {
419       try {
420         Thread.sleep(sleepTime);
421         totalWait += sleepTime;
422         if (totalWait >= maxWait) {
423           fail("Waited too long for table to be disabled + " + tablename);
424         }
425       } catch (InterruptedException e) {
426         e.printStackTrace();
427         fail("Interrupted when trying to disable table " + tablename);
428       }
429     }
430     admin.deleteTable(tablename);
431   }
432 
433   /**
434    * This creates a clean table and confirms that the table is clean.
435    */
436   @Test
437   public void testHBaseFsckClean() throws Exception {
438     assertNoErrors(doFsck(conf, false));
439     TableName table = TableName.valueOf("tableClean");
440     try {
441       HBaseFsck hbck = doFsck(conf, false);
442       assertNoErrors(hbck);
443 
444       setupTable(table);
445       assertEquals(ROWKEYS.length, countRows());
446 
447       // We created 1 table, should be fine
448       hbck = doFsck(conf, false);
449       assertNoErrors(hbck);
450       assertEquals(0, hbck.getOverlapGroups(table).size());
451       assertEquals(ROWKEYS.length, countRows());
452     } finally {
453       deleteTable(table);
454     }
455   }
456 
457   /**
458    * Test thread pooling in the case where there are more regions than threads
459    */
460   @Test
461   public void testHbckThreadpooling() throws Exception {
462     TableName table =
463         TableName.valueOf("tableDupeStartKey");
464     try {
465       // Create table with 4 regions
466       setupTable(table);
467 
468       // limit number of threads to 1.
469       Configuration newconf = new Configuration(conf);
470       newconf.setInt("hbasefsck.numthreads", 1);
471       assertNoErrors(doFsck(newconf, false));
472 
473       // We should pass without triggering a RejectedExecutionException
474     } finally {
475       deleteTable(table);
476     }
477   }
478 
479   @Test
480   public void testHbckFixOrphanTable() throws Exception {
481     TableName table = TableName.valueOf("tableInfo");
482     FileSystem fs = null;
483     Path tableinfo = null;
484     try {
485       setupTable(table);
486       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
487 
488       Path hbaseTableDir = FSUtils.getTableDir(
489           FSUtils.getRootDir(conf), table);
490       fs = hbaseTableDir.getFileSystem(conf);
491       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
492       tableinfo = status.getPath();
493       fs.rename(tableinfo, new Path("/.tableinfo"));
494 
495       //to report error if .tableinfo is missing.
496       HBaseFsck hbck = doFsck(conf, false);
497       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
498 
499       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
500       hbck = doFsck(conf, true);
501       assertNoErrors(hbck);
502       status = null;
503       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
504       assertNotNull(status);
505 
506       HTableDescriptor htd = admin.getTableDescriptor(table);
507       htd.setValue("NOT_DEFAULT", "true");
508       admin.disableTable(table);
509       admin.modifyTable(table, htd);
510       admin.enableTable(table);
511       fs.delete(status.getPath(), true);
512 
513       // fix OrphanTable with cache
514       htd = admin.getTableDescriptor(table); // warms up cached htd on master
515       hbck = doFsck(conf, true);
516       assertNoErrors(hbck);
517       status = null;
518       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
519       assertNotNull(status);
520       htd = admin.getTableDescriptor(table);
521       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
522     } finally {
523       fs.rename(new Path("/.tableinfo"), tableinfo);
524       deleteTable(table);
525     }
526   }
527 
528   /**
529    * This test makes sure that parallel instances of Hbck is disabled.
530    *
531    * @throws Exception
532    */
533   @Test
534   public void testParallelHbck() throws Exception {
535     final ExecutorService service;
536     final Future<HBaseFsck> hbck1,hbck2;
537 
538     class RunHbck implements Callable<HBaseFsck>{
539       boolean fail = true;
540       @Override
541       public HBaseFsck call(){
542         try{
543           return doFsck(conf, false);
544         } catch(Exception e){
545           if (e.getMessage().contains("Duplicate hbck")) {
546             fail = false;
547           } else {
548             LOG.fatal("hbck failed.", e);
549           }
550         }
551         // If we reach here, then an exception was caught
552         if (fail) fail();
553         return null;
554       }
555     }
556     service = Executors.newFixedThreadPool(2);
557     hbck1 = service.submit(new RunHbck());
558     hbck2 = service.submit(new RunHbck());
559     service.shutdown();
560     //wait for 15 seconds, for both hbck calls finish
561     service.awaitTermination(15, TimeUnit.SECONDS);
562     HBaseFsck h1 = hbck1.get();
563     HBaseFsck h2 = hbck2.get();
564     // Make sure only one of the calls was successful
565     assert(h1 == null || h2 == null);
566     if (h1 != null) {
567       assert(h1.getRetCode() >= 0);
568     }
569     if (h2 != null) {
570       assert(h2.getRetCode() >= 0);
571     }
572   }
573 
574   /**
575    * This create and fixes a bad table with regions that have a duplicate
576    * start key
577    */
578   @Test
579   public void testDupeStartKey() throws Exception {
580     TableName table =
581         TableName.valueOf("tableDupeStartKey");
582     try {
583       setupTable(table);
584       assertNoErrors(doFsck(conf, false));
585       assertEquals(ROWKEYS.length, countRows());
586 
587       // Now let's mess it up, by adding a region with a duplicate startkey
588       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
589           Bytes.toBytes("A"), Bytes.toBytes("A2"));
590       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
591       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
592           .waitForAssignment(hriDupe);
593       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
594       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
595 
596       HBaseFsck hbck = doFsck(conf, false);
597       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
598             ERROR_CODE.DUPE_STARTKEYS});
599       assertEquals(2, hbck.getOverlapGroups(table).size());
600       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
601 
602       // fix the degenerate region.
603       doFsck(conf,true);
604 
605       // check that the degenerate region is gone and no data loss
606       HBaseFsck hbck2 = doFsck(conf,false);
607       assertNoErrors(hbck2);
608       assertEquals(0, hbck2.getOverlapGroups(table).size());
609       assertEquals(ROWKEYS.length, countRows());
610     } finally {
611       deleteTable(table);
612     }
613   }
614 
615   /**
616    * Get region info from local cluster.
617    */
618   Map<ServerName, List<String>> getDeployedHRIs(
619       final HBaseAdmin admin) throws IOException {
620     ClusterStatus status = admin.getClusterStatus();
621     Collection<ServerName> regionServers = status.getServers();
622     Map<ServerName, List<String>> mm =
623         new HashMap<ServerName, List<String>>();
624     HConnection connection = admin.getConnection();
625     for (ServerName hsi : regionServers) {
626       AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
627 
628       // list all online regions from this region server
629       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
630       List<String> regionNames = new ArrayList<String>();
631       for (HRegionInfo hri : regions) {
632         regionNames.add(hri.getRegionNameAsString());
633       }
634       mm.put(hsi, regionNames);
635     }
636     return mm;
637   }
638 
639   /**
640    * Returns the HSI a region info is on.
641    */
642   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
643     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
644       if (e.getValue().contains(hri.getRegionNameAsString())) {
645         return e.getKey();
646       }
647     }
648     return null;
649   }
650 
651   /**
652    * This create and fixes a bad table with regions that have a duplicate
653    * start key
654    */
655   @Test
656   public void testDupeRegion() throws Exception {
657     TableName table =
658         TableName.valueOf("tableDupeRegion");
659     try {
660       setupTable(table);
661       assertNoErrors(doFsck(conf, false));
662       assertEquals(ROWKEYS.length, countRows());
663 
664       // Now let's mess it up, by adding a region with a duplicate startkey
665       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
666           Bytes.toBytes("A"), Bytes.toBytes("B"));
667 
668       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
669       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
670           .waitForAssignment(hriDupe);
671       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
672       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
673 
674       // Yikes! The assignment manager can't tell between diff between two
675       // different regions with the same start/endkeys since it doesn't
676       // differentiate on ts/regionId!  We actually need to recheck
677       // deployments!
678       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
679       while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
680         Thread.sleep(250);
681       }
682 
683       LOG.debug("Finished assignment of dupe region");
684 
685       // TODO why is dupe region different from dupe start keys?
686       HBaseFsck hbck = doFsck(conf, false);
687       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
688             ERROR_CODE.DUPE_STARTKEYS});
689       assertEquals(2, hbck.getOverlapGroups(table).size());
690       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
691 
692       // fix the degenerate region.
693       doFsck(conf,true);
694 
695       // check that the degenerate region is gone and no data loss
696       HBaseFsck hbck2 = doFsck(conf,false);
697       assertNoErrors(hbck2);
698       assertEquals(0, hbck2.getOverlapGroups(table).size());
699       assertEquals(ROWKEYS.length, countRows());
700     } finally {
701       deleteTable(table);
702     }
703   }
704 
705   /**
706    * This creates and fixes a bad table with regions that has startkey == endkey
707    */
708   @Test
709   public void testDegenerateRegions() throws Exception {
710     TableName table =
711         TableName.valueOf("tableDegenerateRegions");
712     try {
713       setupTable(table);
714       assertNoErrors(doFsck(conf,false));
715       assertEquals(ROWKEYS.length, countRows());
716 
717       // Now let's mess it up, by adding a region with a duplicate startkey
718       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
719           Bytes.toBytes("B"), Bytes.toBytes("B"));
720       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
721       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
722           .waitForAssignment(hriDupe);
723       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
724       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
725 
726       HBaseFsck hbck = doFsck(conf,false);
727       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
728           ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
729       assertEquals(2, hbck.getOverlapGroups(table).size());
730       assertEquals(ROWKEYS.length, countRows());
731 
732       // fix the degenerate region.
733       doFsck(conf,true);
734 
735       // check that the degenerate region is gone and no data loss
736       HBaseFsck hbck2 = doFsck(conf,false);
737       assertNoErrors(hbck2);
738       assertEquals(0, hbck2.getOverlapGroups(table).size());
739       assertEquals(ROWKEYS.length, countRows());
740     } finally {
741       deleteTable(table);
742     }
743   }
744 
745   /**
746    * This creates and fixes a bad table where a region is completely contained
747    * by another region.
748    */
749   @Test
750   public void testContainedRegionOverlap() throws Exception {
751     TableName table =
752         TableName.valueOf("tableContainedRegionOverlap");
753     try {
754       setupTable(table);
755       assertEquals(ROWKEYS.length, countRows());
756 
757       // Mess it up by creating an overlap in the metadata
758       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
759           Bytes.toBytes("A2"), Bytes.toBytes("B"));
760       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
761       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
762           .waitForAssignment(hriOverlap);
763       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
764       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
765 
766       HBaseFsck hbck = doFsck(conf, false);
767       assertErrors(hbck, new ERROR_CODE[] {
768           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
769       assertEquals(2, hbck.getOverlapGroups(table).size());
770       assertEquals(ROWKEYS.length, countRows());
771 
772       // fix the problem.
773       doFsck(conf, true);
774 
775       // verify that overlaps are fixed
776       HBaseFsck hbck2 = doFsck(conf,false);
777       assertNoErrors(hbck2);
778       assertEquals(0, hbck2.getOverlapGroups(table).size());
779       assertEquals(ROWKEYS.length, countRows());
780     } finally {
781        deleteTable(table);
782     }
783   }
784 
785   /**
786    * This creates and fixes a bad table where an overlap group of
787    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
788    * region. Mess around the meta data so that closeRegion/offlineRegion
789    * throws exceptions.
790    */
791   @Test
792   public void testSidelineOverlapRegion() throws Exception {
793     TableName table =
794         TableName.valueOf("testSidelineOverlapRegion");
795     try {
796       setupTable(table);
797       assertEquals(ROWKEYS.length, countRows());
798 
799       // Mess it up by creating an overlap
800       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
801       HMaster master = cluster.getMaster();
802       HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
803         Bytes.toBytes("A"), Bytes.toBytes("AB"));
804       master.assignRegion(hriOverlap1);
805       master.getAssignmentManager().waitForAssignment(hriOverlap1);
806       HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
807         Bytes.toBytes("AB"), Bytes.toBytes("B"));
808       master.assignRegion(hriOverlap2);
809       master.getAssignmentManager().waitForAssignment(hriOverlap2);
810 
811       HBaseFsck hbck = doFsck(conf, false);
812       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
813         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
814       assertEquals(3, hbck.getOverlapGroups(table).size());
815       assertEquals(ROWKEYS.length, countRows());
816 
817       // mess around the overlapped regions, to trigger NotServingRegionException
818       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
819       ServerName serverName = null;
820       byte[] regionName = null;
821       for (HbckInfo hbi: overlapGroups.values()) {
822         if ("A".equals(Bytes.toString(hbi.getStartKey()))
823             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
824           regionName = hbi.getRegionName();
825 
826           // get an RS not serving the region to force bad assignment info in to META.
827           int k = cluster.getServerWith(regionName);
828           for (int i = 0; i < 3; i++) {
829             if (i != k) {
830               HRegionServer rs = cluster.getRegionServer(i);
831               serverName = rs.getServerName();
832               break;
833             }
834           }
835 
836           HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
837           HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
838             cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
839           admin.offline(regionName);
840           break;
841         }
842       }
843 
844       assertNotNull(regionName);
845       assertNotNull(serverName);
846       HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
847       Put put = new Put(regionName);
848       put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
849         Bytes.toBytes(serverName.getHostAndPort()));
850       meta.put(put);
851 
852       // fix the problem.
853       HBaseFsck fsck = new HBaseFsck(conf);
854       fsck.connect();
855       fsck.setDisplayFullReport(); // i.e. -details
856       fsck.setTimeLag(0);
857       fsck.setFixAssignments(true);
858       fsck.setFixMeta(true);
859       fsck.setFixHdfsHoles(true);
860       fsck.setFixHdfsOverlaps(true);
861       fsck.setFixHdfsOrphans(true);
862       fsck.setFixVersionFile(true);
863       fsck.setSidelineBigOverlaps(true);
864       fsck.setMaxMerge(2);
865       fsck.onlineHbck();
866 
867       // verify that overlaps are fixed, and there are less rows
868       // since one region is sidelined.
869       HBaseFsck hbck2 = doFsck(conf,false);
870       assertNoErrors(hbck2);
871       assertEquals(0, hbck2.getOverlapGroups(table).size());
872       assertTrue(ROWKEYS.length > countRows());
873     } finally {
874        deleteTable(table);
875     }
876   }
877 
878   /**
879    * This creates and fixes a bad table where a region is completely contained
880    * by another region, and there is a hole (sort of like a bad split)
881    */
882   @Test
883   public void testOverlapAndOrphan() throws Exception {
884     TableName table =
885         TableName.valueOf("tableOverlapAndOrphan");
886     try {
887       setupTable(table);
888       assertEquals(ROWKEYS.length, countRows());
889 
890       // Mess it up by creating an overlap in the metadata
891       TEST_UTIL.getHBaseAdmin().disableTable(table);
892       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
893           Bytes.toBytes("B"), true, true, false, true);
894       TEST_UTIL.getHBaseAdmin().enableTable(table);
895 
896       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
897           Bytes.toBytes("A2"), Bytes.toBytes("B"));
898       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
899       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
900           .waitForAssignment(hriOverlap);
901       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
902       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
903 
904       HBaseFsck hbck = doFsck(conf, false);
905       assertErrors(hbck, new ERROR_CODE[] {
906           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
907           ERROR_CODE.HOLE_IN_REGION_CHAIN});
908 
909       // fix the problem.
910       doFsck(conf, true);
911 
912       // verify that overlaps are fixed
913       HBaseFsck hbck2 = doFsck(conf,false);
914       assertNoErrors(hbck2);
915       assertEquals(0, hbck2.getOverlapGroups(table).size());
916       assertEquals(ROWKEYS.length, countRows());
917     } finally {
918        deleteTable(table);
919     }
920   }
921 
922   /**
923    * This creates and fixes a bad table where a region overlaps two regions --
924    * a start key contained in another region and its end key is contained in
925    * yet another region.
926    */
927   @Test
928   public void testCoveredStartKey() throws Exception {
929     TableName table =
930         TableName.valueOf("tableCoveredStartKey");
931     try {
932       setupTable(table);
933       assertEquals(ROWKEYS.length, countRows());
934 
935       // Mess it up by creating an overlap in the metadata
936       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
937           Bytes.toBytes("A2"), Bytes.toBytes("B2"));
938       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
939       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
940           .waitForAssignment(hriOverlap);
941       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
942       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
943 
944       HBaseFsck hbck = doFsck(conf, false);
945       assertErrors(hbck, new ERROR_CODE[] {
946           ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
947           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
948       assertEquals(3, hbck.getOverlapGroups(table).size());
949       assertEquals(ROWKEYS.length, countRows());
950 
951       // fix the problem.
952       doFsck(conf, true);
953 
954       // verify that overlaps are fixed
955       HBaseFsck hbck2 = doFsck(conf, false);
956       assertErrors(hbck2, new ERROR_CODE[0]);
957       assertEquals(0, hbck2.getOverlapGroups(table).size());
958       assertEquals(ROWKEYS.length, countRows());
959     } finally {
960       deleteTable(table);
961     }
962   }
963 
964   /**
965    * This creates and fixes a bad table with a missing region -- hole in meta
966    * and data missing in the fs.
967    */
968   @Test
969   public void testRegionHole() throws Exception {
970     TableName table =
971         TableName.valueOf("tableRegionHole");
972     try {
973       setupTable(table);
974       assertEquals(ROWKEYS.length, countRows());
975 
976       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
977       TEST_UTIL.getHBaseAdmin().disableTable(table);
978       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
979           Bytes.toBytes("C"), true, true, true);
980       TEST_UTIL.getHBaseAdmin().enableTable(table);
981 
982       HBaseFsck hbck = doFsck(conf, false);
983       assertErrors(hbck, new ERROR_CODE[] {
984           ERROR_CODE.HOLE_IN_REGION_CHAIN});
985       // holes are separate from overlap groups
986       assertEquals(0, hbck.getOverlapGroups(table).size());
987 
988       // fix hole
989       doFsck(conf, true);
990 
991       // check that hole fixed
992       assertNoErrors(doFsck(conf,false));
993       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region so lost a row
994     } finally {
995       deleteTable(table);
996     }
997   }
998 
999   /**
1000    * This creates and fixes a bad table with a missing region -- hole in meta
1001    * and data present but .regioinfino missing (an orphan hdfs region)in the fs.
1002    */
1003   @Test
1004   public void testHDFSRegioninfoMissing() throws Exception {
1005     TableName table =
1006         TableName.valueOf("tableHDFSRegioininfoMissing");
1007     try {
1008       setupTable(table);
1009       assertEquals(ROWKEYS.length, countRows());
1010 
1011       // Mess it up by leaving a hole in the meta data
1012       TEST_UTIL.getHBaseAdmin().disableTable(table);
1013       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1014           Bytes.toBytes("C"), true, true, false, true);
1015       TEST_UTIL.getHBaseAdmin().enableTable(table);
1016 
1017       HBaseFsck hbck = doFsck(conf, false);
1018       assertErrors(hbck, new ERROR_CODE[] {
1019           ERROR_CODE.ORPHAN_HDFS_REGION,
1020           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1021           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1022       // holes are separate from overlap groups
1023       assertEquals(0, hbck.getOverlapGroups(table).size());
1024 
1025       // fix hole
1026       doFsck(conf, true);
1027 
1028       // check that hole fixed
1029       assertNoErrors(doFsck(conf, false));
1030       assertEquals(ROWKEYS.length, countRows());
1031     } finally {
1032       deleteTable(table);
1033     }
1034   }
1035 
1036   /**
1037    * This creates and fixes a bad table with a region that is missing meta and
1038    * not assigned to a region server.
1039    */
1040   @Test
1041   public void testNotInMetaOrDeployedHole() throws Exception {
1042     TableName table =
1043         TableName.valueOf("tableNotInMetaOrDeployedHole");
1044     try {
1045       setupTable(table);
1046       assertEquals(ROWKEYS.length, countRows());
1047 
1048       // Mess it up by leaving a hole in the meta data
1049       TEST_UTIL.getHBaseAdmin().disableTable(table);
1050       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1051           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1052       TEST_UTIL.getHBaseAdmin().enableTable(table);
1053 
1054       HBaseFsck hbck = doFsck(conf, false);
1055       assertErrors(hbck, new ERROR_CODE[] {
1056           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1057       // holes are separate from overlap groups
1058       assertEquals(0, hbck.getOverlapGroups(table).size());
1059 
1060       // fix hole
1061       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1062           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1063 
1064       // check that hole fixed
1065       assertNoErrors(doFsck(conf,false));
1066       assertEquals(ROWKEYS.length, countRows());
1067     } finally {
1068       deleteTable(table);
1069     }
1070   }
1071 
1072   /**
1073    * This creates fixes a bad table with a hole in meta.
1074    */
1075   @Test
1076   public void testNotInMetaHole() throws Exception {
1077     TableName table =
1078         TableName.valueOf("tableNotInMetaHole");
1079     try {
1080       setupTable(table);
1081       assertEquals(ROWKEYS.length, countRows());
1082 
1083       // Mess it up by leaving a hole in the meta data
1084       TEST_UTIL.getHBaseAdmin().disableTable(table);
1085       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1086           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1087       TEST_UTIL.getHBaseAdmin().enableTable(table);
1088 
1089       HBaseFsck hbck = doFsck(conf, false);
1090       assertErrors(hbck, new ERROR_CODE[] {
1091           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1092       // holes are separate from overlap groups
1093       assertEquals(0, hbck.getOverlapGroups(table).size());
1094 
1095       // fix hole
1096       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1097           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1098 
1099       // check that hole fixed
1100       assertNoErrors(doFsck(conf,false));
1101       assertEquals(ROWKEYS.length, countRows());
1102     } finally {
1103       deleteTable(table);
1104     }
1105   }
1106 
1107   /**
1108    * This creates and fixes a bad table with a region that is in meta but has
1109    * no deployment or data hdfs
1110    */
1111   @Test
1112   public void testNotInHdfs() throws Exception {
1113     TableName table =
1114         TableName.valueOf("tableNotInHdfs");
1115     try {
1116       setupTable(table);
1117       assertEquals(ROWKEYS.length, countRows());
1118 
1119       // make sure data in regions, if in hlog only there is no data loss
1120       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1121 
1122       // Mess it up by leaving a hole in the hdfs data
1123       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1124           Bytes.toBytes("C"), false, false, true); // don't rm meta
1125 
1126       HBaseFsck hbck = doFsck(conf, false);
1127       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1128       // holes are separate from overlap groups
1129       assertEquals(0, hbck.getOverlapGroups(table).size());
1130 
1131       // fix hole
1132       doFsck(conf, true);
1133 
1134       // check that hole fixed
1135       assertNoErrors(doFsck(conf,false));
1136       assertEquals(ROWKEYS.length - 2, countRows());
1137     } finally {
1138       deleteTable(table);
1139     }
1140   }
1141 
1142   /**
1143    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1144    * remove the table.
1145    */
1146   @Test
1147   public void testNoHdfsTable() throws Exception {
1148     TableName table = TableName.valueOf("NoHdfsTable");
1149     setupTable(table);
1150     assertEquals(ROWKEYS.length, countRows());
1151 
1152     // make sure data in regions, if in hlog only there is no data loss
1153     TEST_UTIL.getHBaseAdmin().flush(table.getName());
1154 
1155     // Mess it up by deleting hdfs dirs
1156     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1157         Bytes.toBytes("A"), false, false, true); // don't rm meta
1158     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1159         Bytes.toBytes("B"), false, false, true); // don't rm meta
1160     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1161         Bytes.toBytes("C"), false, false, true); // don't rm meta
1162     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1163         Bytes.toBytes(""), false, false, true); // don't rm meta
1164 
1165     // also remove the table directory in hdfs
1166     deleteTableDir(table);
1167 
1168     HBaseFsck hbck = doFsck(conf, false);
1169     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1170         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1171         ERROR_CODE.NOT_IN_HDFS,});
1172     // holes are separate from overlap groups
1173     assertEquals(0, hbck.getOverlapGroups(table).size());
1174 
1175     // fix hole
1176     doFsck(conf, true); // detect dangling regions and remove those
1177 
1178     // check that hole fixed
1179     assertNoErrors(doFsck(conf,false));
1180     assertFalse("Table "+ table + " should have been deleted",
1181         TEST_UTIL.getHBaseAdmin().tableExists(table));
1182   }
1183 
1184   public void deleteTableDir(TableName table) throws IOException {
1185     Path rootDir = FSUtils.getRootDir(conf);
1186     FileSystem fs = rootDir.getFileSystem(conf);
1187     Path p = FSUtils.getTableDir(rootDir, table);
1188     HBaseFsck.debugLsr(conf, p);
1189     boolean success = fs.delete(p, true);
1190     LOG.info("Deleted " + p + " sucessfully? " + success);
1191   }
1192 
1193   /**
1194    * when the hbase.version file missing, It is fix the fault.
1195    */
1196   @Test
1197   public void testNoVersionFile() throws Exception {
1198     // delete the hbase.version file
1199     Path rootDir = FSUtils.getRootDir(conf);
1200     FileSystem fs = rootDir.getFileSystem(conf);
1201     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1202     fs.delete(versionFile, true);
1203 
1204     // test
1205     HBaseFsck hbck = doFsck(conf, false);
1206     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1207     // fix hbase.version missing
1208     doFsck(conf, true);
1209 
1210     // no version file fixed
1211     assertNoErrors(doFsck(conf, false));
1212   }
1213 
1214   /**
1215    * The region is not deployed when the table is disabled.
1216    */
1217   @Test
1218   public void testRegionShouldNotBeDeployed() throws Exception {
1219     TableName table =
1220         TableName.valueOf("tableRegionShouldNotBeDeployed");
1221     try {
1222       LOG.info("Starting testRegionShouldNotBeDeployed.");
1223       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1224       assertTrue(cluster.waitForActiveAndReadyMaster());
1225 
1226 
1227       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1228           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1229       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1230       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1231 
1232       // Write the .tableinfo
1233       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1234       fstd.createTableDescriptor(htdDisabled);
1235       List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
1236           TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
1237 
1238       // Let's just assign everything to first RS
1239       HRegionServer hrs = cluster.getRegionServer(0);
1240 
1241       // Create region files.
1242       TEST_UTIL.getHBaseAdmin().disableTable(table);
1243       TEST_UTIL.getHBaseAdmin().enableTable(table);
1244 
1245       // Disable the table and close its regions
1246       TEST_UTIL.getHBaseAdmin().disableTable(table);
1247       HRegionInfo region = disabledRegions.remove(0);
1248       byte[] regionName = region.getRegionName();
1249 
1250       // The region should not be assigned currently
1251       assertTrue(cluster.getServerWith(regionName) == -1);
1252 
1253       // Directly open a region on a region server.
1254       // If going through AM/ZK, the region won't be open.
1255       // Even it is opened, AM will close it which causes
1256       // flakiness of this test.
1257       HRegion r = HRegion.openHRegion(
1258         region, htdDisabled, hrs.getWAL(region), conf);
1259       hrs.addToOnlineRegions(r);
1260 
1261       HBaseFsck hbck = doFsck(conf, false);
1262       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1263 
1264       // fix this fault
1265       doFsck(conf, true);
1266 
1267       // check result
1268       assertNoErrors(doFsck(conf, false));
1269     } finally {
1270       TEST_UTIL.getHBaseAdmin().enableTable(table);
1271       deleteTable(table);
1272     }
1273   }
1274 
1275   /**
1276    * This creates two tables and mess both of them and fix them one by one
1277    */
1278   @Test
1279   public void testFixByTable() throws Exception {
1280     TableName table1 =
1281         TableName.valueOf("testFixByTable1");
1282     TableName table2 =
1283         TableName.valueOf("testFixByTable2");
1284     try {
1285       setupTable(table1);
1286       // make sure data in regions, if in hlog only there is no data loss
1287       TEST_UTIL.getHBaseAdmin().flush(table1.getName());
1288       // Mess them up by leaving a hole in the hdfs data
1289       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1290         Bytes.toBytes("C"), false, false, true); // don't rm meta
1291 
1292       setupTable(table2);
1293       // make sure data in regions, if in hlog only there is no data loss
1294       TEST_UTIL.getHBaseAdmin().flush(table2.getName());
1295       // Mess them up by leaving a hole in the hdfs data
1296       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1297         Bytes.toBytes("C"), false, false, true); // don't rm meta
1298 
1299       HBaseFsck hbck = doFsck(conf, false);
1300       assertErrors(hbck, new ERROR_CODE[] {
1301         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1302 
1303       // fix hole in table 1
1304       doFsck(conf, true, table1);
1305       // check that hole in table 1 fixed
1306       assertNoErrors(doFsck(conf, false, table1));
1307       // check that hole in table 2 still there
1308       assertErrors(doFsck(conf, false, table2),
1309         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1310 
1311       // fix hole in table 2
1312       doFsck(conf, true, table2);
1313       // check that hole in both tables fixed
1314       assertNoErrors(doFsck(conf, false));
1315       assertEquals(ROWKEYS.length - 2, countRows());
1316     } finally {
1317       deleteTable(table1);
1318       deleteTable(table2);
1319     }
1320   }
1321   /**
1322    * A split parent in meta, in hdfs, and not deployed
1323    */
1324   @Test
1325   public void testLingeringSplitParent() throws Exception {
1326     TableName table =
1327         TableName.valueOf("testLingeringSplitParent");
1328     HTable meta = null;
1329     try {
1330       setupTable(table);
1331       assertEquals(ROWKEYS.length, countRows());
1332 
1333       // make sure data in regions, if in hlog only there is no data loss
1334       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1335       HRegionLocation location = tbl.getRegionLocation("B");
1336 
1337       // Delete one region from meta, but not hdfs, unassign it.
1338       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1339         Bytes.toBytes("C"), true, true, false);
1340 
1341       // Create a new meta entry to fake it as a split parent.
1342       meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
1343       HRegionInfo hri = location.getRegionInfo();
1344 
1345       HRegionInfo a = new HRegionInfo(tbl.getName(),
1346         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1347       HRegionInfo b = new HRegionInfo(tbl.getName(),
1348         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1349 
1350       hri.setOffline(true);
1351       hri.setSplit(true);
1352 
1353       MetaEditor.addRegionToMeta(meta, hri, a, b);
1354       meta.flushCommits();
1355       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1356 
1357       HBaseFsck hbck = doFsck(conf, false);
1358       assertErrors(hbck, new ERROR_CODE[] {
1359         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1360 
1361       // regular repair cannot fix lingering split parent
1362       hbck = doFsck(conf, true);
1363       assertErrors(hbck, new ERROR_CODE[] {
1364         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1365       assertFalse(hbck.shouldRerun());
1366       hbck = doFsck(conf, false);
1367       assertErrors(hbck, new ERROR_CODE[] {
1368         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1369 
1370       // fix lingering split parent
1371       hbck = new HBaseFsck(conf);
1372       hbck.connect();
1373       hbck.setDisplayFullReport(); // i.e. -details
1374       hbck.setTimeLag(0);
1375       hbck.setFixSplitParents(true);
1376       hbck.onlineHbck();
1377       assertTrue(hbck.shouldRerun());
1378 
1379       Get get = new Get(hri.getRegionName());
1380       Result result = meta.get(get);
1381       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1382         HConstants.SPLITA_QUALIFIER).isEmpty());
1383       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1384         HConstants.SPLITB_QUALIFIER).isEmpty());
1385       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1386 
1387       // fix other issues
1388       doFsck(conf, true);
1389 
1390       // check that all are fixed
1391       assertNoErrors(doFsck(conf, false));
1392       assertEquals(ROWKEYS.length, countRows());
1393     } finally {
1394       deleteTable(table);
1395       IOUtils.closeQuietly(meta);
1396     }
1397   }
1398 
1399   /**
1400    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1401    * valid cases where the daughters are there.
1402    */
1403   @Test
1404   public void testValidLingeringSplitParent() throws Exception {
1405     TableName table =
1406         TableName.valueOf("testLingeringSplitParent");
1407     HTable meta = null;
1408     try {
1409       setupTable(table);
1410       assertEquals(ROWKEYS.length, countRows());
1411 
1412       // make sure data in regions, if in hlog only there is no data loss
1413       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1414       HRegionLocation location = tbl.getRegionLocation("B");
1415 
1416       meta = new HTable(conf, TableName.META_TABLE_NAME);
1417       HRegionInfo hri = location.getRegionInfo();
1418 
1419       // do a regular split
1420       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1421       byte[] regionName = location.getRegionInfo().getRegionName();
1422       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1423       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1424           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1425 
1426       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1427       // for some time until children references are deleted. HBCK erroneously sees this as
1428       // overlapping regions
1429       HBaseFsck hbck = doFsck(
1430         conf, true, true, false, false, false, true, true, true, false, false, false, null);
1431       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1432 
1433       // assert that the split hbase:meta entry is still there.
1434       Get get = new Get(hri.getRegionName());
1435       Result result = meta.get(get);
1436       assertNotNull(result);
1437       assertNotNull(HRegionInfo.getHRegionInfo(result));
1438 
1439       assertEquals(ROWKEYS.length, countRows());
1440 
1441       // assert that we still have the split regions
1442       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1443       assertNoErrors(doFsck(conf, false));
1444     } finally {
1445       deleteTable(table);
1446       IOUtils.closeQuietly(meta);
1447     }
1448   }
1449 
1450   /**
1451    * Split crashed after write to hbase:meta finished for the parent region, but
1452    * failed to write daughters (pre HBASE-7721 codebase)
1453    */
1454   @Test(timeout=75000)
1455   public void testSplitDaughtersNotInMeta() throws Exception {
1456     TableName table =
1457         TableName.valueOf("testSplitdaughtersNotInMeta");
1458     HTable meta = null;
1459     try {
1460       setupTable(table);
1461       assertEquals(ROWKEYS.length, countRows());
1462 
1463       // make sure data in regions, if in hlog only there is no data loss
1464       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1465       HRegionLocation location = tbl.getRegionLocation("B");
1466 
1467       meta = new HTable(conf, TableName.META_TABLE_NAME);
1468       HRegionInfo hri = location.getRegionInfo();
1469 
1470       // do a regular split
1471       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1472       byte[] regionName = location.getRegionInfo().getRegionName();
1473       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1474       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1475           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1476 
1477       PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(meta.get(new Get(regionName)));
1478 
1479       // Delete daughter regions from meta, but not hdfs, unassign it.
1480       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1481       undeployRegion(admin, hris.get(daughters.getFirst()), daughters.getFirst());
1482       undeployRegion(admin, hris.get(daughters.getSecond()), daughters.getSecond());
1483 
1484       meta.delete(new Delete(daughters.getFirst().getRegionName()));
1485       meta.delete(new Delete(daughters.getSecond().getRegionName()));
1486       meta.flushCommits();
1487 
1488       HBaseFsck hbck = doFsck(conf, false);
1489       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1490           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
1491 
1492       // now fix it. The fix should not revert the region split, but add daughters to META
1493       hbck = doFsck(
1494         conf, true, true, false, false, false, false, false, false, false, false, false, null);
1495       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1496           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1497 
1498       // assert that the split hbase:meta entry is still there.
1499       Get get = new Get(hri.getRegionName());
1500       Result result = meta.get(get);
1501       assertNotNull(result);
1502       assertNotNull(HRegionInfo.getHRegionInfo(result));
1503 
1504       assertEquals(ROWKEYS.length, countRows());
1505 
1506       // assert that we still have the split regions
1507       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1508       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1509     } finally {
1510       deleteTable(table);
1511       IOUtils.closeQuietly(meta);
1512     }
1513   }
1514 
1515   /**
1516    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1517    * meta and data missing in the fs.
1518    */
1519   @Test(timeout=120000)
1520   public void testMissingFirstRegion() throws Exception {
1521     TableName table =
1522         TableName.valueOf("testMissingFirstRegion");
1523     try {
1524       setupTable(table);
1525       assertEquals(ROWKEYS.length, countRows());
1526 
1527       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1528       TEST_UTIL.getHBaseAdmin().disableTable(table);
1529       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1530           true, true);
1531       TEST_UTIL.getHBaseAdmin().enableTable(table);
1532 
1533       HBaseFsck hbck = doFsck(conf, false);
1534       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1535       // fix hole
1536       doFsck(conf, true);
1537       // check that hole fixed
1538       assertNoErrors(doFsck(conf, false));
1539     } finally {
1540       deleteTable(table);
1541     }
1542   }
1543 
1544   /**
1545    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1546    * meta and data missing in the fs.
1547    */
1548   @Test(timeout=120000)
1549   public void testRegionDeployedNotInHdfs() throws Exception {
1550     TableName table =
1551         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1552     try {
1553       setupTable(table);
1554       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1555 
1556       // Mess it up by deleting region dir
1557       deleteRegion(conf, tbl.getTableDescriptor(),
1558         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1559         false, true);
1560 
1561       HBaseFsck hbck = doFsck(conf, false);
1562       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1563       // fix hole
1564       doFsck(conf, true);
1565       // check that hole fixed
1566       assertNoErrors(doFsck(conf, false));
1567     } finally {
1568       deleteTable(table);
1569     }
1570   }
1571 
1572   /**
1573    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1574    * the fs.
1575    */
1576   @Test(timeout=120000)
1577   public void testMissingLastRegion() throws Exception {
1578     TableName table =
1579         TableName.valueOf("testMissingLastRegion");
1580     try {
1581       setupTable(table);
1582       assertEquals(ROWKEYS.length, countRows());
1583 
1584       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1585       TEST_UTIL.getHBaseAdmin().disableTable(table);
1586       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1587           true, true);
1588       TEST_UTIL.getHBaseAdmin().enableTable(table);
1589 
1590       HBaseFsck hbck = doFsck(conf, false);
1591       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1592       // fix hole
1593       doFsck(conf, true);
1594       // check that hole fixed
1595       assertNoErrors(doFsck(conf, false));
1596     } finally {
1597       deleteTable(table);
1598     }
1599   }
1600 
1601   /**
1602    * Test -noHdfsChecking option can detect and fix assignments issue.
1603    */
1604   @Test
1605   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1606     TableName table =
1607         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1608     try {
1609       setupTable(table);
1610       assertEquals(ROWKEYS.length, countRows());
1611 
1612       // Mess it up by closing a region
1613       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1614         Bytes.toBytes("B"), true, false, false, false);
1615 
1616       // verify there is no other errors
1617       HBaseFsck hbck = doFsck(conf, false);
1618       assertErrors(hbck, new ERROR_CODE[] {
1619         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1620 
1621       // verify that noHdfsChecking report the same errors
1622       HBaseFsck fsck = new HBaseFsck(conf);
1623       fsck.connect();
1624       fsck.setDisplayFullReport(); // i.e. -details
1625       fsck.setTimeLag(0);
1626       fsck.setCheckHdfs(false);
1627       fsck.onlineHbck();
1628       assertErrors(fsck, new ERROR_CODE[] {
1629         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1630 
1631       // verify that fixAssignments works fine with noHdfsChecking
1632       fsck = new HBaseFsck(conf);
1633       fsck.connect();
1634       fsck.setDisplayFullReport(); // i.e. -details
1635       fsck.setTimeLag(0);
1636       fsck.setCheckHdfs(false);
1637       fsck.setFixAssignments(true);
1638       fsck.onlineHbck();
1639       assertTrue(fsck.shouldRerun());
1640       fsck.onlineHbck();
1641       assertNoErrors(fsck);
1642 
1643       assertEquals(ROWKEYS.length, countRows());
1644     } finally {
1645       deleteTable(table);
1646     }
1647   }
1648 
1649   /**
1650    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1651    * However, it can not fix it without checking Hdfs because we need to get
1652    * the region info from Hdfs in this case, then to patch the meta.
1653    */
1654   @Test
1655   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1656     TableName table =
1657         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1658     try {
1659       setupTable(table);
1660       assertEquals(ROWKEYS.length, countRows());
1661 
1662       // Mess it up by deleting a region from the metadata
1663       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1664         Bytes.toBytes("B"), false, true, false, false);
1665 
1666       // verify there is no other errors
1667       HBaseFsck hbck = doFsck(conf, false);
1668       assertErrors(hbck, new ERROR_CODE[] {
1669         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1670 
1671       // verify that noHdfsChecking report the same errors
1672       HBaseFsck fsck = new HBaseFsck(conf);
1673       fsck.connect();
1674       fsck.setDisplayFullReport(); // i.e. -details
1675       fsck.setTimeLag(0);
1676       fsck.setCheckHdfs(false);
1677       fsck.onlineHbck();
1678       assertErrors(fsck, new ERROR_CODE[] {
1679         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1680 
1681       // verify that fixMeta doesn't work with noHdfsChecking
1682       fsck = new HBaseFsck(conf);
1683       fsck.connect();
1684       fsck.setDisplayFullReport(); // i.e. -details
1685       fsck.setTimeLag(0);
1686       fsck.setCheckHdfs(false);
1687       fsck.setFixAssignments(true);
1688       fsck.setFixMeta(true);
1689       fsck.onlineHbck();
1690       assertFalse(fsck.shouldRerun());
1691       assertErrors(fsck, new ERROR_CODE[] {
1692         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1693 
1694       // fix the cluster so other tests won't be impacted
1695       fsck = doFsck(conf, true);
1696       assertTrue(fsck.shouldRerun());
1697       fsck = doFsck(conf, true);
1698       assertNoErrors(fsck);
1699     } finally {
1700       deleteTable(table);
1701     }
1702   }
1703 
1704   /**
1705    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1706    * and -noHdfsChecking can't detect orphan Hdfs region.
1707    */
1708   @Test
1709   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1710     TableName table =
1711         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1712     try {
1713       setupTable(table);
1714       assertEquals(ROWKEYS.length, countRows());
1715 
1716       // Mess it up by creating an overlap in the metadata
1717       TEST_UTIL.getHBaseAdmin().disableTable(table);
1718       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1719         Bytes.toBytes("B"), true, true, false, true);
1720       TEST_UTIL.getHBaseAdmin().enableTable(table);
1721 
1722       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
1723         Bytes.toBytes("A2"), Bytes.toBytes("B"));
1724       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1725       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1726         .waitForAssignment(hriOverlap);
1727       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1728       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1729 
1730       HBaseFsck hbck = doFsck(conf, false);
1731       assertErrors(hbck, new ERROR_CODE[] {
1732         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1733         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1734 
1735       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1736       HBaseFsck fsck = new HBaseFsck(conf);
1737       fsck.connect();
1738       fsck.setDisplayFullReport(); // i.e. -details
1739       fsck.setTimeLag(0);
1740       fsck.setCheckHdfs(false);
1741       fsck.onlineHbck();
1742       assertErrors(fsck, new ERROR_CODE[] {
1743         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1744 
1745       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1746       fsck = new HBaseFsck(conf);
1747       fsck.connect();
1748       fsck.setDisplayFullReport(); // i.e. -details
1749       fsck.setTimeLag(0);
1750       fsck.setCheckHdfs(false);
1751       fsck.setFixHdfsHoles(true);
1752       fsck.setFixHdfsOverlaps(true);
1753       fsck.setFixHdfsOrphans(true);
1754       fsck.onlineHbck();
1755       assertFalse(fsck.shouldRerun());
1756       assertErrors(fsck, new ERROR_CODE[] {
1757         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1758     } finally {
1759       if (TEST_UTIL.getHBaseAdmin().isTableDisabled(table)) {
1760         TEST_UTIL.getHBaseAdmin().enableTable(table);
1761       }
1762       deleteTable(table);
1763     }
1764   }
1765 
1766   /**
1767    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1768    * legitimate hfile and return it.
1769    * @param fs
1770    * @param table
1771    * @return Path of a flushed hfile.
1772    * @throws IOException
1773    */
1774   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1775     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1776     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1777     Path famDir = new Path(regionDir, FAM_STR);
1778 
1779     // keep doing this until we get a legit hfile
1780     while (true) {
1781       FileStatus[] hfFss = fs.listStatus(famDir);
1782       if (hfFss.length == 0) {
1783         continue;
1784       }
1785       for (FileStatus hfs : hfFss) {
1786         if (!hfs.isDir()) {
1787           return hfs.getPath();
1788         }
1789       }
1790     }
1791   }
1792 
1793   /**
1794    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1795    */
1796   @Test(timeout=180000)
1797   public void testQuarantineCorruptHFile() throws Exception {
1798     TableName table = TableName.valueOf(name.getMethodName());
1799     try {
1800       setupTable(table);
1801       assertEquals(ROWKEYS.length, countRows());
1802       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1803 
1804       FileSystem fs = FileSystem.get(conf);
1805       Path hfile = getFlushedHFile(fs, table);
1806 
1807       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1808       TEST_UTIL.getHBaseAdmin().disableTable(table);
1809 
1810       // create new corrupt file called deadbeef (valid hfile name)
1811       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1812       TestHFile.truncateFile(fs, hfile, corrupt);
1813       LOG.info("Created corrupted file " + corrupt);
1814       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1815 
1816       // we cannot enable here because enable never finished due to the corrupt region.
1817       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1818       assertEquals(res.getRetCode(), 0);
1819       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1820       assertEquals(hfcc.getHFilesChecked(), 5);
1821       assertEquals(hfcc.getCorrupted().size(), 1);
1822       assertEquals(hfcc.getFailures().size(), 0);
1823       assertEquals(hfcc.getQuarantined().size(), 1);
1824       assertEquals(hfcc.getMissing().size(), 0);
1825 
1826       // Its been fixed, verify that we can enable.
1827       TEST_UTIL.getHBaseAdmin().enableTable(table);
1828     } finally {
1829       deleteTable(table);
1830     }
1831   }
1832 
1833   /**
1834   * Test that use this should have a timeout, because this method could potentially wait forever.
1835   */
1836   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1837                                 int corrupt, int fail, int quar, int missing) throws Exception {
1838     try {
1839       setupTable(table);
1840       assertEquals(ROWKEYS.length, countRows());
1841       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1842 
1843       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1844       TEST_UTIL.getHBaseAdmin().disableTable(table);
1845 
1846       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1847           table.getNameAsString()};
1848       ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1849       HBaseFsck res = hbck.exec(exec, args);
1850 
1851       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1852       assertEquals(hfcc.getHFilesChecked(), check);
1853       assertEquals(hfcc.getCorrupted().size(), corrupt);
1854       assertEquals(hfcc.getFailures().size(), fail);
1855       assertEquals(hfcc.getQuarantined().size(), quar);
1856       assertEquals(hfcc.getMissing().size(), missing);
1857 
1858       // its been fixed, verify that we can enable
1859       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1860       admin.enableTableAsync(table);
1861       while (!admin.isTableEnabled(table)) {
1862         try {
1863           Thread.sleep(250);
1864         } catch (InterruptedException e) {
1865           e.printStackTrace();
1866           fail("Interrupted when trying to enable table " + table);
1867         }
1868       }
1869     } finally {
1870       deleteTable(table);
1871     }
1872   }
1873 
1874   /**
1875    * This creates a table and simulates the race situation where a concurrent compaction or split
1876    * has removed an hfile after the corruption checker learned about it.
1877    */
1878   @Test(timeout=180000)
1879   public void testQuarantineMissingHFile() throws Exception {
1880     TableName table = TableName.valueOf(name.getMethodName());
1881     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1882     // inject a fault in the hfcc created.
1883     final FileSystem fs = FileSystem.get(conf);
1884     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1885       @Override
1886       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1887         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1888           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1889           @Override
1890           protected void checkHFile(Path p) throws IOException {
1891             if (attemptedFirstHFile.compareAndSet(false, true)) {
1892               assertTrue(fs.delete(p, true)); // make sure delete happened.
1893             }
1894             super.checkHFile(p);
1895           }
1896         };
1897       }
1898     };
1899     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1900   }
1901 
1902   /**
1903    * This creates a table and simulates the race situation where a concurrent compaction or split
1904    * has removed an colfam dir before the corruption checker got to it.
1905    */
1906   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1907   // files in a column family on initial creation -- as suggested by Matteo.
1908   @Ignore @Test(timeout=180000)
1909   public void testQuarantineMissingFamdir() throws Exception {
1910     TableName table = TableName.valueOf(name.getMethodName());
1911     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1912     // inject a fault in the hfcc created.
1913     final FileSystem fs = FileSystem.get(conf);
1914     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1915       @Override
1916       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1917         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1918           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1919           @Override
1920           protected void checkColFamDir(Path p) throws IOException {
1921             if (attemptedFirstHFile.compareAndSet(false, true)) {
1922               assertTrue(fs.delete(p, true)); // make sure delete happened.
1923             }
1924             super.checkColFamDir(p);
1925           }
1926         };
1927       }
1928     };
1929     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1930   }
1931 
1932   /**
1933    * This creates a table and simulates the race situation where a concurrent compaction or split
1934    * has removed a region dir before the corruption checker got to it.
1935    */
1936   @Test(timeout=180000)
1937   public void testQuarantineMissingRegionDir() throws Exception {
1938     TableName table = TableName.valueOf(name.getMethodName());
1939     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1940     // inject a fault in the hfcc created.
1941     final FileSystem fs = FileSystem.get(conf);
1942     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1943       @Override
1944       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1945         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1946           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1947           @Override
1948           protected void checkRegionDir(Path p) throws IOException {
1949             if (attemptedFirstHFile.compareAndSet(false, true)) {
1950               assertTrue(fs.delete(p, true)); // make sure delete happened.
1951             }
1952             super.checkRegionDir(p);
1953           }
1954         };
1955       }
1956     };
1957     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1958   }
1959 
1960   /**
1961    * Test fixing lingering reference file.
1962    */
1963   @Test
1964   public void testLingeringReferenceFile() throws Exception {
1965     TableName table =
1966         TableName.valueOf("testLingeringReferenceFile");
1967     try {
1968       setupTable(table);
1969       assertEquals(ROWKEYS.length, countRows());
1970 
1971       // Mess it up by creating a fake reference file
1972       FileSystem fs = FileSystem.get(conf);
1973       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1974       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1975       Path famDir = new Path(regionDir, FAM_STR);
1976       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
1977       fs.create(fakeReferenceFile);
1978 
1979       HBaseFsck hbck = doFsck(conf, false);
1980       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
1981       // fix reference file
1982       doFsck(conf, true);
1983       // check that reference file fixed
1984       assertNoErrors(doFsck(conf, false));
1985     } finally {
1986       deleteTable(table);
1987     }
1988   }
1989 
1990   /**
1991    * Test mission REGIONINFO_QUALIFIER in hbase:meta
1992    */
1993   @Test
1994   public void testMissingRegionInfoQualifier() throws Exception {
1995     TableName table =
1996         TableName.valueOf("testMissingRegionInfoQualifier");
1997     try {
1998       setupTable(table);
1999 
2000       // Mess it up by removing the RegionInfo for one region.
2001       final List<Delete> deletes = new LinkedList<Delete>();
2002       HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
2003       MetaScanner.metaScan(conf, new MetaScanner.MetaScannerVisitor() {
2004 
2005         @Override
2006         public boolean processRow(Result rowResult) throws IOException {
2007           HRegionInfo hri = MetaScanner.getHRegionInfo(rowResult);
2008           if (hri != null && !hri.getTable().isSystemTable()) {
2009             Delete delete = new Delete(rowResult.getRow());
2010             delete.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2011             deletes.add(delete);
2012           }
2013           return true;
2014         }
2015 
2016         @Override
2017         public void close() throws IOException {
2018         }
2019       });
2020       meta.delete(deletes);
2021 
2022       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2023       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2024         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2025       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2026         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2027       meta.close();
2028 
2029       HBaseFsck hbck = doFsck(conf, false);
2030       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2031 
2032       // fix reference file
2033       hbck = doFsck(conf, true);
2034 
2035       // check that reference file fixed
2036       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2037     } finally {
2038       deleteTable(table);
2039     }
2040   }
2041 
2042 
2043   /**
2044    * Test pluggable error reporter. It can be plugged in
2045    * from system property or configuration.
2046    */
2047   @Test
2048   public void testErrorReporter() throws Exception {
2049     try {
2050       MockErrorReporter.calledCount = 0;
2051       doFsck(conf, false);
2052       assertEquals(MockErrorReporter.calledCount, 0);
2053 
2054       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2055       doFsck(conf, false);
2056       assertTrue(MockErrorReporter.calledCount > 20);
2057     } finally {
2058       conf.set("hbasefsck.errorreporter",
2059         PrintingErrorReporter.class.getName());
2060       MockErrorReporter.calledCount = 0;
2061     }
2062   }
2063 
2064   static class MockErrorReporter implements ErrorReporter {
2065     static int calledCount = 0;
2066 
2067     @Override
2068     public void clear() {
2069       calledCount++;
2070     }
2071 
2072     @Override
2073     public void report(String message) {
2074       calledCount++;
2075     }
2076 
2077     @Override
2078     public void reportError(String message) {
2079       calledCount++;
2080     }
2081 
2082     @Override
2083     public void reportError(ERROR_CODE errorCode, String message) {
2084       calledCount++;
2085     }
2086 
2087     @Override
2088     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2089       calledCount++;
2090     }
2091 
2092     @Override
2093     public void reportError(ERROR_CODE errorCode,
2094         String message, TableInfo table, HbckInfo info) {
2095       calledCount++;
2096     }
2097 
2098     @Override
2099     public void reportError(ERROR_CODE errorCode, String message,
2100         TableInfo table, HbckInfo info1, HbckInfo info2) {
2101       calledCount++;
2102     }
2103 
2104     @Override
2105     public int summarize() {
2106       return ++calledCount;
2107     }
2108 
2109     @Override
2110     public void detail(String details) {
2111       calledCount++;
2112     }
2113 
2114     @Override
2115     public ArrayList<ERROR_CODE> getErrorList() {
2116       calledCount++;
2117       return new ArrayList<ERROR_CODE>();
2118     }
2119 
2120     @Override
2121     public void progress() {
2122       calledCount++;
2123     }
2124 
2125     @Override
2126     public void print(String message) {
2127       calledCount++;
2128     }
2129 
2130     @Override
2131     public void resetErrors() {
2132       calledCount++;
2133     }
2134 
2135     @Override
2136     public boolean tableHasErrors(TableInfo table) {
2137       calledCount++;
2138       return false;
2139     }
2140   }
2141 
2142   @Test(timeout=180000)
2143   public void testCheckTableLocks() throws Exception {
2144     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2145     EnvironmentEdgeManager.injectEdge(edge);
2146     // check no errors
2147     HBaseFsck hbck = doFsck(conf, false);
2148     assertNoErrors(hbck);
2149 
2150     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2151 
2152     // obtain one lock
2153     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2154     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2155         "testCheckTableLocks");
2156     writeLock.acquire();
2157     hbck = doFsck(conf, false);
2158     assertNoErrors(hbck); // should not have expired, no problems
2159 
2160     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2161         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2162 
2163     hbck = doFsck(conf, false);
2164     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2165 
2166     final CountDownLatch latch = new CountDownLatch(1);
2167     new Thread() {
2168       @Override
2169       public void run() {
2170         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2171             "testCheckTableLocks");
2172         try {
2173           latch.countDown();
2174           readLock.acquire();
2175         } catch (IOException ex) {
2176           fail();
2177         } catch (IllegalStateException ex) {
2178           return; // expected, since this will be reaped under us.
2179         }
2180         fail("should not have come here");
2181       };
2182     }.start();
2183 
2184     latch.await(); // wait until thread starts
2185     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2186 
2187     hbck = doFsck(conf, false);
2188     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2189 
2190     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2191         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2192 
2193     hbck = doFsck(conf, false);
2194     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2195 
2196     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2197                                                                  // which is not injectable through EnvironmentEdge
2198     Threads.sleep(10);
2199     hbck = doFsck(conf, true); // now fix both cases
2200 
2201     hbck = doFsck(conf, false);
2202     assertNoErrors(hbck);
2203 
2204     // ensure that locks are deleted
2205     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2206         "should acquire without blocking");
2207     writeLock.acquire(); // this should not block.
2208     writeLock.release(); // release for clean state
2209   }
2210 
2211   /**
2212    * Test orphaned table ZNode (for table states)
2213    */
2214   @Test
2215   public void testOrphanedTableZNode() throws Exception {
2216     TableName table = TableName.valueOf("testOrphanedZKTableEntry");
2217 
2218     try {
2219       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getZKTable().
2220       setEnablingTable(table);
2221 
2222       try {
2223         setupTable(table);
2224         Assert.fail(
2225           "Create table should fail when its ZNode has already existed with ENABLING state.");
2226       } catch(TableExistsException t) {
2227         //Expected exception
2228       }
2229       // The setup table was interrupted in some state that needs to some cleanup.
2230       try {
2231         deleteTable(table);
2232       } catch (IOException e) {
2233         // Because create table failed, it is expected that the cleanup table would
2234         // throw some exception.  Ignore and continue.
2235       }
2236 
2237       HBaseFsck hbck = doFsck(conf, false);
2238       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2239 
2240       // fix the orphaned ZK entry
2241       hbck = doFsck(conf, true);
2242 
2243       // check that orpahned ZK table entry is gone.
2244       hbck = doFsck(conf, false);
2245       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2246       // Now create table should succeed.
2247       setupTable(table);
2248     } finally {
2249       // This code could be called that either a table was created successfully or set up
2250       // table failed in some unknown state.  Therefore, clean up can either succeed or fail.
2251       try {
2252         deleteTable(table);
2253       } catch (IOException e) {
2254         // The cleanup table would throw some exception if create table failed in some state.
2255         // Ignore this exception
2256       }
2257     }
2258   }
2259 
2260   @Test
2261   public void testMetaOffline() throws Exception {
2262     // check no errors
2263     HBaseFsck hbck = doFsck(conf, false);
2264     assertNoErrors(hbck);
2265     deleteMetaRegion(conf, true, false, false);
2266     hbck = doFsck(conf, false);
2267     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2268     // inconsistency and whether we will be fixing it or not.
2269     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2270     hbck = doFsck(conf, true);
2271     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2272     hbck = doFsck(conf, false);
2273     assertNoErrors(hbck);
2274   }
2275 
2276   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2277       boolean regionInfoOnly) throws IOException, InterruptedException {
2278     HConnection connection = HConnectionManager.getConnection(conf);
2279     HRegionLocation metaLocation = connection.locateRegion(TableName.META_TABLE_NAME,
2280         HConstants.EMPTY_START_ROW);
2281     ServerName hsa = metaLocation.getServerName();
2282     HRegionInfo hri = metaLocation.getRegionInfo();
2283     if (unassign) {
2284       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2285       undeployRegion(new HBaseAdmin(conf), hsa, hri);
2286     }
2287 
2288     if (regionInfoOnly) {
2289       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2290       Path rootDir = FSUtils.getRootDir(conf);
2291       FileSystem fs = rootDir.getFileSystem(conf);
2292       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2293           hri.getEncodedName());
2294       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2295       fs.delete(hriPath, true);
2296     }
2297 
2298     if (hdfs) {
2299       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2300       Path rootDir = FSUtils.getRootDir(conf);
2301       FileSystem fs = rootDir.getFileSystem(conf);
2302       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2303           hri.getEncodedName());
2304       HBaseFsck.debugLsr(conf, p);
2305       boolean success = fs.delete(p, true);
2306       LOG.info("Deleted " + p + " sucessfully? " + success);
2307       HBaseFsck.debugLsr(conf, p);
2308     }
2309   }
2310 
2311   @Test
2312   public void testTableWithNoRegions() throws Exception {
2313     // We might end up with empty regions in a table
2314     // see also testNoHdfsTable()
2315     TableName table =
2316         TableName.valueOf(name.getMethodName());
2317     try {
2318       // create table with one region
2319       HTableDescriptor desc = new HTableDescriptor(table);
2320       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2321       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2322       TEST_UTIL.getHBaseAdmin().createTable(desc);
2323       tbl = new HTable(TEST_UTIL.getConfiguration(), table, executorService);
2324 
2325       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2326       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false,
2327           false, true);
2328 
2329       HBaseFsck hbck = doFsck(conf, false);
2330       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2331 
2332       doFsck(conf, true);
2333 
2334       // fix hole
2335       doFsck(conf, true);
2336 
2337       // check that hole fixed
2338       assertNoErrors(doFsck(conf, false));
2339     } finally {
2340       deleteTable(table);
2341     }
2342 
2343   }
2344 
2345   @Test
2346   public void testHbckAfterRegionMerge() throws Exception {
2347     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2348     HTable meta = null;
2349     try {
2350       // disable CatalogJanitor
2351       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2352       setupTable(table);
2353       assertEquals(ROWKEYS.length, countRows());
2354 
2355       // make sure data in regions, if in hlog only there is no data loss
2356       TEST_UTIL.getHBaseAdmin().flush(table.getName());
2357       HRegionInfo region1 = tbl.getRegionLocation("A").getRegionInfo();
2358       HRegionInfo region2 = tbl.getRegionLocation("B").getRegionInfo();
2359 
2360       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2361 
2362       assertNotEquals(region1, region2);
2363 
2364       // do a region merge
2365       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
2366       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2367           region2.getEncodedNameAsBytes(), false);
2368 
2369       // wait until region merged
2370       long timeout = System.currentTimeMillis() + 30 * 1000;
2371       while (true) {
2372         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2373           break;
2374         } else if (System.currentTimeMillis() > timeout) {
2375           fail("Time out waiting on region " + region1.getEncodedName()
2376               + " and " + region2.getEncodedName() + " be merged");
2377         }
2378         Thread.sleep(10);
2379       }
2380 
2381       assertEquals(ROWKEYS.length, countRows());
2382 
2383       HBaseFsck hbck = doFsck(conf, false);
2384       assertNoErrors(hbck); // no errors
2385 
2386     } finally {
2387       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2388       deleteTable(table);
2389       IOUtils.closeQuietly(meta);
2390     }
2391   }
2392 
2393   @Test
2394   public void testRegionBoundariesCheck() throws Exception {
2395     HBaseFsck hbck = doFsck(conf, false);
2396     assertNoErrors(hbck); // no errors
2397     try {
2398       hbck.checkRegionBoundaries();
2399     } catch (IllegalArgumentException e) {
2400       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2401         fail("Table directory path is not valid." + e.getMessage());
2402       }
2403     }
2404   }
2405 
2406   @org.junit.Rule
2407   public TestName name = new TestName();
2408 
2409   @Test
2410   public void testReadOnlyProperty() throws Exception {
2411     HBaseFsck hbck = doFsck(conf, false);
2412     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2413       hbck.shouldIgnorePreCheckPermission());
2414 
2415     hbck = doFsck(conf, true);
2416     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2417       hbck.shouldIgnorePreCheckPermission());
2418 
2419     hbck = doFsck(conf, true);
2420     hbck.setIgnorePreCheckPermission(true);
2421     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2422       hbck.shouldIgnorePreCheckPermission());
2423   }
2424 
2425   @Before
2426   public void setUp() {
2427     EnvironmentEdgeManager.reset();
2428   }
2429 
2430   @Test (timeout=180000)
2431   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2432     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2433     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2434     try {
2435       HTableDescriptor desc = new HTableDescriptor(table);
2436       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2437       TEST_UTIL.getHBaseAdmin().createTable(desc);
2438       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2439       for (int i = 0; i < 5; i++) {
2440         Put p1 = new Put(("r" + i).getBytes());
2441         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
2442         tbl.put(p1);
2443       }
2444       TEST_UTIL.getHBaseAdmin().flush(desc.getTableName().toString());
2445       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2446       int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
2447       HRegionServer regionServer = cluster.getRegionServer(serverWith);
2448       cluster.getServerWith(regions.get(0).getRegionName());
2449       SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
2450       st.prepare();
2451       st.stepsBeforePONR(regionServer, regionServer, false);
2452       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2453       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2454       for (RegionState state : regionsInTransition.values()) {
2455         am.regionOffline(state.getRegion());
2456       }
2457       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2458       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2459       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2460       am.assign(regionsMap);
2461       am.waitForAssignment(regions.get(0).getRegionInfo());
2462       HBaseFsck hbck = doFsck(conf, false);
2463       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2464           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2465       // holes are separate from overlap groups
2466       assertEquals(0, hbck.getOverlapGroups(table).size());
2467 
2468       // fix hole
2469       assertErrors(
2470         doFsck(
2471           conf, false, true, false, false, false, false, false, false, false, false, false, null),
2472         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2473           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2474 
2475       // check that hole fixed
2476       assertNoErrors(doFsck(conf, false));
2477       assertEquals(5, countRows());
2478     } finally {
2479       if (tbl != null) {
2480         tbl.close();
2481         tbl = null;
2482       }
2483       deleteTable(table);
2484     }
2485   }
2486 }