View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.io.IOException;
22  import java.util.Collection;
23  import java.util.LinkedList;
24  import java.util.List;
25  
26  import org.apache.commons.lang.math.RandomUtils;
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  import org.apache.hadoop.hbase.ClusterStatus;
30  import org.apache.hadoop.hbase.HBaseCluster;
31  import org.apache.hadoop.hbase.HRegionInfo;
32  import org.apache.hadoop.hbase.IntegrationTestingUtility;
33  import org.apache.hadoop.hbase.ServerLoad;
34  import org.apache.hadoop.hbase.ServerName;
35  import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
36  import org.apache.hadoop.hbase.client.HBaseAdmin;
37  import org.apache.hadoop.hbase.util.Bytes;
38  
39  /**
40   * A (possibly mischievous) action that the ChaosMonkey can perform.
41   */
42  public class Action {
43  
44    public static final String KILL_MASTER_TIMEOUT_KEY =
45        "hbase.chaosmonkey.action.killmastertimeout";
46    public static final String START_MASTER_TIMEOUT_KEY =
47        "hbase.chaosmonkey.action.startmastertimeout";
48    public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
49    public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
50  
51    protected static Log LOG = LogFactory.getLog(Action.class);
52  
53    protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
54    protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
55    protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
56    protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
57  
58    protected ActionContext context;
59    protected HBaseCluster cluster;
60    protected ClusterStatus initialStatus;
61    protected ServerName[] initialServers;
62  
63    protected long killMasterTimeout;
64    protected long startMasterTimeout;
65    protected long killRsTimeout;
66    protected long startRsTimeout;
67  
68    public void init(ActionContext context) throws IOException {
69      this.context = context;
70      cluster = context.getHBaseCluster();
71      initialStatus = cluster.getInitialClusterStatus();
72      Collection<ServerName> regionServers = initialStatus.getServers();
73      initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
74  
75      killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
76          KILL_MASTER_TIMEOUT_DEFAULT);
77      startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
78          START_MASTER_TIMEOUT_DEFAULT);
79      killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
80      startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
81    }
82  
83    public void perform() throws Exception { }
84  
85    /** Returns current region servers */
86    protected ServerName[] getCurrentServers() throws IOException {
87      Collection<ServerName> regionServers = cluster.getClusterStatus().getServers();
88      if (regionServers == null || regionServers.size() <= 0) return new ServerName [] {};
89      return regionServers.toArray(new ServerName[regionServers.size()]);
90    }
91  
92    protected void killMaster(ServerName server) throws IOException {
93      LOG.info("Killing master:" + server);
94      cluster.killMaster(server);
95      cluster.waitForMasterToStop(server, killMasterTimeout);
96      LOG.info("Killed master server:" + server);
97    }
98  
99    protected void startMaster(ServerName server) throws IOException {
100     LOG.info("Starting master:" + server.getHostname());
101     cluster.startMaster(server.getHostname(), server.getPort());
102     cluster.waitForActiveAndReadyMaster(startMasterTimeout);
103     LOG.info("Started master: " + server);
104   }
105 
106   protected void killRs(ServerName server) throws IOException {
107     LOG.info("Killing region server:" + server);
108     cluster.killRegionServer(server);
109     cluster.waitForRegionServerToStop(server, killRsTimeout);
110     LOG.info("Killed region server:" + server + ". Reported num of rs:"
111         + cluster.getClusterStatus().getServersSize());
112   }
113 
114   protected void startRs(ServerName server) throws IOException {
115     LOG.info("Starting region server:" + server.getHostname());
116     cluster.startRegionServer(server.getHostname(), server.getPort());
117     cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
118     LOG.info("Started region server:" + server + ". Reported num of rs:"
119         + cluster.getClusterStatus().getServersSize());
120   }
121 
122   protected void unbalanceRegions(ClusterStatus clusterStatus,
123       List<ServerName> fromServers, List<ServerName> toServers,
124       double fractionOfRegions) throws Exception {
125     List<byte[]> victimRegions = new LinkedList<byte[]>();
126     for (ServerName server : fromServers) {
127       ServerLoad serverLoad = clusterStatus.getLoad(server);
128       // Ugh.
129       List<byte[]> regions = new LinkedList<byte[]>(serverLoad.getRegionsLoad().keySet());
130       int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
131       LOG.debug("Removing " + victimRegionCount + " regions from " + server.getServerName());
132       for (int i = 0; i < victimRegionCount; ++i) {
133         int victimIx = RandomUtils.nextInt(regions.size());
134         String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
135         victimRegions.add(Bytes.toBytes(regionId));
136       }
137     }
138 
139     LOG.info("Moving " + victimRegions.size() + " regions from " + fromServers.size()
140         + " servers to " + toServers.size() + " different servers");
141     HBaseAdmin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
142     for (byte[] victimRegion : victimRegions) {
143       int targetIx = RandomUtils.nextInt(toServers.size());
144       admin.move(victimRegion, Bytes.toBytes(toServers.get(targetIx).getServerName()));
145     }
146   }
147 
148   protected void forceBalancer() throws Exception {
149     HBaseAdmin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
150     boolean result = false;
151     try {
152       result = admin.balancer();
153     } catch (Exception e) {
154       LOG.warn("Got exception while doing balance ", e);
155     }
156     if (!result) {
157       LOG.error("Balancer didn't succeed");
158     }
159   }
160 
161   /**
162    * Context for Action's
163    */
164   public static class ActionContext {
165     private IntegrationTestingUtility util;
166 
167     public ActionContext(IntegrationTestingUtility util) {
168       this.util = util;
169     }
170 
171     public IntegrationTestingUtility getHBaseIntegrationTestingUtility() {
172       return util;
173     }
174 
175     public HBaseCluster getHBaseCluster() {
176       return util.getHBaseClusterInterface();
177     }
178   }
179 }