Reading files from an HDFS HarFileSystem archive with libhdfs: a C example

The key is that when creating the fs handle you must spell out the scheme: har://scheme-hostname:port/archivepath/fileinarchive. Pointing the connection at the top-level archive directory is enough.
fs = hdfsConnect("har://hdfs-172.16.48.4:9000/qhl/real_archive_max/archive_max.har/", 9000);

Note that the URI above must end with a "/". Otherwise HDFS treats the path as a file rather than a directory and reports a file-not-found error.
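
Whether the trailing "/" was honored can be checked before any read: hdfsExists() returns 0 only when the path resolves on the connected filesystem. Below is a minimal sketch, reusing the cluster address and the hello.csv_4 member file from this post:

#include "hdfs.h"
#include <stdio.h>

int main(void) {
    // the trailing "/" makes HDFS treat the har path as a directory
    hdfsFS fs = hdfsConnect(
        "har://hdfs-172.16.48.4:9000/qhl/real_archive_max/archive_max.har/", 9000);
    if (!fs) {
        fprintf(stderr, "connect failed\n");
        return 1;
    }
    // hdfsExists() returns 0 if the path resolves inside the archive
    if (hdfsExists(fs, "hello.csv_4") == 0) {
        printf("hello.csv_4 is visible inside the archive\n");
    } else {
        printf("hello.csv_4 not found; check the har:// URI\n");
    }
    hdfsDisconnect(fs);
    return 0;
}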

Run the program as follows. hello.csv_4 is one of the original files archived into archive_max.har; that is, har://hdfs-172.16.48.4:9000/qhl/real_archive_max/archive_max.har/hello.csv_4 is the complete path of the archived file.

./hdfs_small_file_optim hello.csv_4 100 100
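
For context: an archive such as archive_max.har is produced by the hadoop archive tool, which runs a MapReduce job. A hypothetical invocation that would place the archive at the path used above; the source directory is made up for illustration:

# general form: hadoop archive -archiveName <name>.har -p <parent> [<src>*] <dest>
# /qhl/small_files is hypothetical; only the destination matches this post
hadoop archive -archiveName archive_max.har -p /qhl/small_files /qhl/real_archive_max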

Appendix

Build script

#!/bin/bash
source /etc/profile
source ~/.bash_profile
gcc hdfs_small_file_optim.c -I/home/test/hadoop/include/ -L/home/test/hadoop/lib/native -lhdfs -o hdfs_small_file_optim
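
Compiling is only half of the setup: libhdfs starts a JVM at run time, so libjvm.so has to be on the library path and the Hadoop jars on CLASSPATH, or hdfsConnect() fails before ever reaching the NameNode. A sketch of a launch script, assuming the same /home/test/hadoop layout as the build script and a JDK 8 style JAVA_HOME; adjust both paths to your installation:

#!/bin/bash
source /etc/profile
source ~/.bash_profile
# libhdfs embeds a JVM: it needs libjvm.so and every Hadoop jar visible
export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64/server:/home/test/hadoop/lib/native:$LD_LIBRARY_PATH
export CLASSPATH=$(/home/test/hadoop/bin/hadoop classpath --glob)
./hdfs_small_file_optim hello.csv_4 100 100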

Test code

#include "hdfs.h"

#include <stdio.h>
#include <stdlib.h>

/**
* An example of using libhdfs to read files. The usage of this file is as follows:
*
* Usage: hdfs_read <filename> <filesize> <buffersize>
*/
int main(int argc, char **argv) {
hdfsFS fs;
const char *rfile = argv[1];
tSize bufferSize = strtoul(argv[3], NULL, 10);
hdfsFile readFile;
char* buffer;
tSize curSize;

if (argc != 4) {
fprintf(stderr, "Usage: hdfs_read <filename> <filesize> <buffersize>\n");
exit(-1);
}
// 这里要写成这样的格式:har://scheme-hostname:port/archivepath/fileinarchive
fs = hdfsConnect("har://hdfs-172.16.48.4:9000/qhl/real_archive_max/archive_max.har/",9000);
if (!fs) {
fprintf(stderr, "Oops! Failed to connect to hdfs!\n");
exit(-1);
}
readFile = hdfsOpenFile(fs, rfile, O_RDONLY, bufferSize, 0, 0);
if (!readFile) {
fprintf(stderr, "Failed to open %s for reading!\n", rfile);
exit(-2);
}

// data to be written to the file
buffer = malloc(sizeof(char) * bufferSize);
if(buffer == NULL) {
return -2;
}

// read from the file
curSize = bufferSize;
fprintf(stdout,"bufferSize=%d, curSize=%d\n", bufferSize, curSize);
for (; curSize == bufferSize;) {
curSize = hdfsRead(fs, readFile, (void*)buffer, curSize);
fprintf(stdout, "read length=%d\n", curSize);
}

fprintf(stdout, "hahahahah=%d\n",curSize);
fprintf(stdout, "buffer=%s\n", buffer);
free(buffer);
hdfsCloseFile(fs, readFile);
hdfsDisconnect(fs);
//sleep(1000);
return 0;
}

/**
* vim: ts=4: sw=4: et:
*/
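
The <filesize> argument above is parsed on the command line but never used; the archive itself knows the logical size of each member file. Below is a small sketch that queries it through the stock libhdfs metadata call hdfsGetPathInfo(); archived_file_size is a hypothetical helper name:

#include "hdfs.h"
#include <stdio.h>

// Print the size the archive reports for a file; returns -1 on error.
// A sketch meant to replace the unused <filesize> command-line argument.
long long archived_file_size(hdfsFS fs, const char *path) {
    hdfsFileInfo *info = hdfsGetPathInfo(fs, path);
    if (info == NULL) {
        fprintf(stderr, "no such path in archive: %s\n", path);
        return -1;
    }
    long long size = (long long)info->mSize;  // logical (pre-archive) size
    hdfsFreeFileInfo(info, 1);
    return size;
}

Calling it as archived_file_size(fs, rfile) right after the connect would let the read loop size its buffer from real metadata instead of a command-line argument.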
