1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.mapreduce;
20
21 import static org.junit.Assert.assertEquals;
22 import static org.junit.Assert.assertFalse;
23 import static org.junit.Assert.assertNull;
24 import static org.junit.Assert.assertTrue;
25 import static org.junit.Assert.fail;
26
27 import java.util.ArrayList;
28
29 import org.apache.hadoop.hbase.HConstants;
30 import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser;
31 import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.BadTsvLineException;
32 import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.ParsedLine;
33 import org.apache.hadoop.hbase.testclassification.SmallTests;
34 import org.apache.hadoop.hbase.util.Bytes;
35 import org.apache.hadoop.hbase.util.Pair;
36 import org.junit.Test;
37 import org.junit.experimental.categories.Category;
38
39 import com.google.common.base.Joiner;
40 import com.google.common.base.Splitter;
41 import com.google.common.collect.Iterables;
42
43
44
45
46 @Category(SmallTests.class)
47 public class TestImportTsvParser {
48
49 private void assertBytesEquals(byte[] a, byte[] b) {
50 assertEquals(Bytes.toStringBinary(a), Bytes.toStringBinary(b));
51 }
52
53 private void checkParsing(ParsedLine parsed, Iterable<String> expected) {
54 ArrayList<String> parsedCols = new ArrayList<String>();
55 for (int i = 0; i < parsed.getColumnCount(); i++) {
56 parsedCols.add(Bytes.toString(parsed.getLineBytes(), parsed.getColumnOffset(i),
57 parsed.getColumnLength(i)));
58 }
59 if (!Iterables.elementsEqual(parsedCols, expected)) {
60 fail("Expected: " + Joiner.on(",").join(expected) + "\n" + "Got:"
61 + Joiner.on(",").join(parsedCols));
62 }
63 }
64
65 @Test
66 public void testTsvParserSpecParsing() {
67 TsvParser parser;
68
69 parser = new TsvParser("HBASE_ROW_KEY", "\t");
70 assertNull(parser.getFamily(0));
71 assertNull(parser.getQualifier(0));
72 assertEquals(0, parser.getRowKeyColumnIndex());
73 assertFalse(parser.hasTimestamp());
74
75 parser = new TsvParser("HBASE_ROW_KEY,col1:scol1", "\t");
76 assertNull(parser.getFamily(0));
77 assertNull(parser.getQualifier(0));
78 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
79 assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
80 assertEquals(0, parser.getRowKeyColumnIndex());
81 assertFalse(parser.hasTimestamp());
82
83 parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,col1:scol2", "\t");
84 assertNull(parser.getFamily(0));
85 assertNull(parser.getQualifier(0));
86 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
87 assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
88 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(2));
89 assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(2));
90 assertEquals(0, parser.getRowKeyColumnIndex());
91 assertFalse(parser.hasTimestamp());
92
93 parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2", "\t");
94 assertNull(parser.getFamily(0));
95 assertNull(parser.getQualifier(0));
96 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
97 assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
98 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
99 assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
100 assertEquals(0, parser.getRowKeyColumnIndex());
101 assertTrue(parser.hasTimestamp());
102 assertEquals(2, parser.getTimestampKeyColumnIndex());
103
104 parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ATTRIBUTES_KEY",
105 "\t");
106 assertNull(parser.getFamily(0));
107 assertNull(parser.getQualifier(0));
108 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
109 assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
110 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
111 assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
112 assertEquals(0, parser.getRowKeyColumnIndex());
113 assertTrue(parser.hasTimestamp());
114 assertEquals(2, parser.getTimestampKeyColumnIndex());
115 assertEquals(4, parser.getAttributesKeyColumnIndex());
116
117 parser = new TsvParser("HBASE_ATTRIBUTES_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ROW_KEY",
118 "\t");
119 assertNull(parser.getFamily(0));
120 assertNull(parser.getQualifier(0));
121 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
122 assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
123 assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
124 assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
125 assertEquals(4, parser.getRowKeyColumnIndex());
126 assertTrue(parser.hasTimestamp());
127 assertEquals(2, parser.getTimestampKeyColumnIndex());
128 assertEquals(0, parser.getAttributesKeyColumnIndex());
129 }
130
131 @Test
132 public void testTsvParser() throws BadTsvLineException {
133 TsvParser parser = new TsvParser("col_a,col_b:qual,HBASE_ROW_KEY,col_d", "\t");
134 assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(0));
135 assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(0));
136 assertBytesEquals(Bytes.toBytes("col_b"), parser.getFamily(1));
137 assertBytesEquals(Bytes.toBytes("qual"), parser.getQualifier(1));
138 assertNull(parser.getFamily(2));
139 assertNull(parser.getQualifier(2));
140 assertEquals(2, parser.getRowKeyColumnIndex());
141
142 assertEquals(TsvParser.DEFAULT_TIMESTAMP_COLUMN_INDEX, parser.getTimestampKeyColumnIndex());
143
144 byte[] line = Bytes.toBytes("val_a\tval_b\tval_c\tval_d");
145 ParsedLine parsed = parser.parse(line, line.length);
146 checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
147 }
148
149 @Test
150 public void testTsvParserWithTimestamp() throws BadTsvLineException {
151 TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
152 assertNull(parser.getFamily(0));
153 assertNull(parser.getQualifier(0));
154 assertNull(parser.getFamily(1));
155 assertNull(parser.getQualifier(1));
156 assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(2));
157 assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(2));
158 assertEquals(0, parser.getRowKeyColumnIndex());
159 assertEquals(1, parser.getTimestampKeyColumnIndex());
160
161 byte[] line = Bytes.toBytes("rowkey\t1234\tval_a");
162 ParsedLine parsed = parser.parse(line, line.length);
163 assertEquals(1234l, parsed.getTimestamp(-1));
164 checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
165 }
166
167
168
169
170 @Test(expected = BadTsvLineException.class)
171 public void testTsvParserBadTsvLineExcessiveColumns() throws BadTsvLineException {
172 TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
173 byte[] line = Bytes.toBytes("val_a\tval_b\tval_c");
174 parser.parse(line, line.length);
175 }
176
177 @Test(expected = BadTsvLineException.class)
178 public void testTsvParserBadTsvLineZeroColumn() throws BadTsvLineException {
179 TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
180 byte[] line = Bytes.toBytes("");
181 parser.parse(line, line.length);
182 }
183
184 @Test(expected = BadTsvLineException.class)
185 public void testTsvParserBadTsvLineOnlyKey() throws BadTsvLineException {
186 TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
187 byte[] line = Bytes.toBytes("key_only");
188 parser.parse(line, line.length);
189 }
190
191 @Test(expected = BadTsvLineException.class)
192 public void testTsvParserBadTsvLineNoRowKey() throws BadTsvLineException {
193 TsvParser parser = new TsvParser("col_a,HBASE_ROW_KEY", "\t");
194 byte[] line = Bytes.toBytes("only_cola_data_and_no_row_key");
195 parser.parse(line, line.length);
196 }
197
198 @Test(expected = BadTsvLineException.class)
199 public void testTsvParserInvalidTimestamp() throws BadTsvLineException {
200 TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
201 assertEquals(1, parser.getTimestampKeyColumnIndex());
202 byte[] line = Bytes.toBytes("rowkey\ttimestamp\tval_a");
203 ParsedLine parsed = parser.parse(line, line.length);
204 assertEquals(-1, parsed.getTimestamp(-1));
205 checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
206 }
207
208 @Test(expected = BadTsvLineException.class)
209 public void testTsvParserNoTimestampValue() throws BadTsvLineException {
210 TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
211 assertEquals(2, parser.getTimestampKeyColumnIndex());
212 byte[] line = Bytes.toBytes("rowkey\tval_a");
213 parser.parse(line, line.length);
214 }
215
216 @Test
217 public void testTsvParserParseRowKey() throws BadTsvLineException {
218 TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
219 assertEquals(0, parser.getRowKeyColumnIndex());
220 byte[] line = Bytes.toBytes("rowkey\tval_a\t1234");
221 Pair<Integer, Integer> rowKeyOffsets = parser.parseRowKey(line, line.length);
222 assertEquals(0, rowKeyOffsets.getFirst().intValue());
223 assertEquals(6, rowKeyOffsets.getSecond().intValue());
224 try {
225 line = Bytes.toBytes("\t\tval_a\t1234");
226 parser.parseRowKey(line, line.length);
227 fail("Should get BadTsvLineException on empty rowkey.");
228 } catch (BadTsvLineException b) {
229
230 }
231 parser = new TsvParser("col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
232 assertEquals(1, parser.getRowKeyColumnIndex());
233 line = Bytes.toBytes("val_a\trowkey\t1234");
234 rowKeyOffsets = parser.parseRowKey(line, line.length);
235 assertEquals(6, rowKeyOffsets.getFirst().intValue());
236 assertEquals(6, rowKeyOffsets.getSecond().intValue());
237 try {
238 line = Bytes.toBytes("val_a");
239 rowKeyOffsets = parser.parseRowKey(line, line.length);
240 fail("Should get BadTsvLineException when number of columns less than rowkey position.");
241 } catch (BadTsvLineException b) {
242
243 }
244 parser = new TsvParser("col_a,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
245 assertEquals(2, parser.getRowKeyColumnIndex());
246 line = Bytes.toBytes("val_a\t1234\trowkey");
247 rowKeyOffsets = parser.parseRowKey(line, line.length);
248 assertEquals(11, rowKeyOffsets.getFirst().intValue());
249 assertEquals(6, rowKeyOffsets.getSecond().intValue());
250 }
251
252 @Test
253 public void testTsvParseAttributesKey() throws BadTsvLineException {
254 TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY", "\t");
255 assertEquals(0, parser.getRowKeyColumnIndex());
256 byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value");
257 ParsedLine parse = parser.parse(line, line.length);
258 assertEquals(18, parse.getAttributeKeyOffset());
259 assertEquals(3, parser.getAttributesKeyColumnIndex());
260 String attributes[] = parse.getIndividualAttributes();
261 assertEquals(attributes[0], "key=>value");
262 try {
263 line = Bytes.toBytes("rowkey\tval_a\t1234");
264 parser.parse(line, line.length);
265 fail("Should get BadTsvLineException on empty rowkey.");
266 } catch (BadTsvLineException b) {
267
268 }
269 parser = new TsvParser("HBASE_ATTRIBUTES_KEY,col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
270 assertEquals(2, parser.getRowKeyColumnIndex());
271 line = Bytes.toBytes("key=>value\tval_a\trowkey\t1234");
272 parse = parser.parse(line, line.length);
273 assertEquals(0, parse.getAttributeKeyOffset());
274 assertEquals(0, parser.getAttributesKeyColumnIndex());
275 attributes = parse.getIndividualAttributes();
276 assertEquals(attributes[0], "key=>value");
277 try {
278 line = Bytes.toBytes("val_a");
279 ParsedLine parse2 = parser.parse(line, line.length);
280 fail("Should get BadTsvLineException when number of columns less than rowkey position.");
281 } catch (BadTsvLineException b) {
282
283 }
284 parser = new TsvParser("col_a,HBASE_ATTRIBUTES_KEY,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
285 assertEquals(3, parser.getRowKeyColumnIndex());
286 line = Bytes.toBytes("val_a\tkey0=>value0,key1=>value1,key2=>value2\t1234\trowkey");
287 parse = parser.parse(line, line.length);
288 assertEquals(1, parser.getAttributesKeyColumnIndex());
289 assertEquals(6, parse.getAttributeKeyOffset());
290 String[] attr = parse.getIndividualAttributes();
291 int i = 0;
292 for(String str : attr) {
293 assertEquals(("key"+i+"=>"+"value"+i), str );
294 i++;
295 }
296 }
297
298 @Test
299 public void testTsvParserWithCellVisibilityCol() throws BadTsvLineException {
300 TsvParser parser = new TsvParser(
301 "HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY,HBASE_CELL_VISIBILITY", "\t");
302 assertEquals(0, parser.getRowKeyColumnIndex());
303 assertEquals(4, parser.getCellVisibilityColumnIndex());
304 byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value\tPRIVATE&SECRET");
305 ParsedLine parse = parser.parse(line, line.length);
306 assertEquals(18, parse.getAttributeKeyOffset());
307 assertEquals(3, parser.getAttributesKeyColumnIndex());
308 String attributes[] = parse.getIndividualAttributes();
309 assertEquals(attributes[0], "key=>value");
310 assertEquals(29, parse.getCellVisibilityColumnOffset());
311 }
312
313 }