HDFS (Hadoop Distributed File System) implementation - #42

This includes an HDFS docker image to use with the integration tests.

Co-authored-by: Ivan Andreev <ivandeex@gmail.com>
Co-authored-by: Nick Craig-Wood <nick@craig-wood.com>
Yury Stankevich
2020-09-28 20:29:44 +03:00
committed by Nick Craig-Wood
parent 768e4c4735
commit 71edc75ca6
26 changed files with 906 additions and 0 deletions
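For orientation, once the TestHdfs remote from the README below is configured, the new backend can be exercised with ordinary rclone commands; the remote name and paths here are only illustrative:

```
rclone mkdir TestHdfs:test
rclone copy /etc/hosts TestHdfs:test
rclone ls TestHdfs:test
rclone purge TestHdfs:test
```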


@@ -0,0 +1,42 @@
# A very minimal hdfs server for integration testing rclone
FROM debian:stretch
# install the JDK plus a few utilities (curl is used below to fetch the hadoop release)
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends openjdk-8-jdk net-tools curl python \
    && rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
ENV HADOOP_VERSION 3.2.1
ENV HADOOP_URL https://www.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
RUN set -x \
&& curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz \
&& tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
&& rm /tmp/hadoop.tar.gz*
RUN ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop
RUN mkdir /opt/hadoop-$HADOOP_VERSION/logs
RUN mkdir /hadoop-data
RUN mkdir -p /hadoop/dfs/name
RUN mkdir -p /hadoop/dfs/data
ENV HADOOP_HOME=/opt/hadoop-$HADOOP_VERSION
ENV HADOOP_CONF_DIR=/etc/hadoop
ENV MULTIHOMED_NETWORK=1
ENV USER=root
ENV PATH $HADOOP_HOME/bin/:$PATH
ADD core-site.xml /etc/hadoop/core-site.xml
ADD hdfs-site.xml /etc/hadoop/hdfs-site.xml
ADD httpfs-site.xml /etc/hadoop/httpfs-site.xml
ADD kms-site.xml /etc/hadoop/kms-site.xml
ADD mapred-site.xml /etc/hadoop/mapred-site.xml
ADD yarn-site.xml /etc/hadoop/yarn-site.xml
ADD run.sh /run.sh
RUN chmod a+x /run.sh
CMD ["/run.sh"]


@@ -0,0 +1,32 @@
# Test HDFS

This is a docker image for rclone's integration tests which runs an
HDFS filesystem inside a container.

## Build
```
docker build --rm -t rclone/test-hdfs .
docker push rclone/test-hdfs
```
## Test
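Start the test container before configuring the remote; this mirrors the `start` function in the helper script at the end of this commit:

```
docker run --rm -d --name "rclone-hdfs" -p 127.0.0.1:9866:9866 -p 127.0.0.1:8020:8020 --hostname "rclone-hdfs" rclone/test-hdfs
```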
Configure the remote:
```
[TestHdfs]
type = hdfs
namenode = 127.0.0.1:8020
username = root
```
Run the tests:
```
cd backend/hdfs
GO111MODULE=on go test -v
```
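If the remote was configured under a different name, the run can be pointed at it explicitly; the `-remote` flag is rclone's usual integration-test convention rather than something added by this commit:

```
GO111MODULE=on go test -v -remote TestHdfs:
```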
Stop the container:
```
docker kill rclone-hdfs
```


@@ -0,0 +1,6 @@
<configuration>
<property><name>fs.defaultFS</name><value>hdfs://localhost:8020</value></property>
<property><name>hadoop.http.staticuser.user</name><value>root</value></property>
<property><name>hadoop.proxyuser.root.groups</name><value>root,nogroup</value></property>
<property><name>hadoop.proxyuser.root.hosts</name><value>*</value></property>
</configuration>
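This points the default filesystem at the namenode on port 8020 and allows the root user to proxy for any host. A quick sanity check from inside a running container (container name taken from the start script at the end of this commit) might look like:

```
docker exec rclone-hdfs hdfs dfs -mkdir -p /test
docker exec rclone-hdfs hdfs dfs -ls /
```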


@@ -0,0 +1,14 @@
<configuration>
<property><name>dfs.client.use.datanode.hostname</name><value>true</value></property>
<property><name>dfs.datanode.data.dir</name><value>file:///hadoop/dfs/data</value></property>
<property><name>dfs.datanode.use.datanode.hostname</name><value>true</value></property>
<property><name>dfs.namenode.accesstime.precision</name><value>3600000</value></property>
<property><name>dfs.namenode.http-bind-host</name><value>0.0.0.0</value></property>
<property><name>dfs.namenode.https-bind-host</name><value>0.0.0.0</value></property>
<property><name>dfs.namenode.name.dir</name><value>file:///hadoop/dfs/name</value></property>
<property><name>dfs.namenode.rpc-bind-host</name><value>0.0.0.0</value></property>
<property><name>dfs.namenode.safemode.extension</name><value>5000</value></property>
<property><name>dfs.namenode.servicerpc-bind-host</name><value>0.0.0.0</value></property>
<property><name>dfs.replication</name><value>2</value></property>
<property><name>nfs.dump.dir</name><value>/tmp</value></property>
</configuration>
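The two `use.datanode.hostname` properties make the client and the datanode address each other by hostname rather than by container-internal IP, which is what lets a client outside the container reach the datanode through the published 9866 port. If the test host cannot resolve the container's hostname, an entry along these lines may be needed (an assumption about the local setup, not part of this commit):

```
echo "127.0.0.1 rclone-hdfs" | sudo tee -a /etc/hosts
```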


@@ -0,0 +1,2 @@
<configuration>
</configuration>


@@ -0,0 +1,2 @@
<configuration>
</configuration>


@@ -0,0 +1,5 @@
<configuration>
<property><name>mapreduce.framework.name</name><value>yarn</value></property>
<property><name>yarn.nodemanager.bind-host</name><value>0.0.0.0</value></property>
</configuration>


@@ -0,0 +1,8 @@
#!/bin/bash
# format the namenode, then run the namenode and datanode in the background
echo format namenode
hdfs namenode -format test
hdfs namenode &
hdfs datanode &
# keep the container alive
exec sleep infinity
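The daemons take a few seconds to come up after the container starts; a command like the following (using the container name from the start script below) can confirm that the datanode has registered with the namenode:

```
docker exec rclone-hdfs hdfs dfsadmin -report
```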


@@ -0,0 +1,14 @@
<configuration>
<property><name>yarn.log-aggregation-enable</name><value>true</value></property>
<property><name>yarn.log.server.url</name><value>http://localhost:8188/applicationhistory/logs/</value></property>
<property><name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name><value>org.apache.hadoop.mapred.ShuffleHandler</value></property>
<property><name>yarn.nodemanager.aux-services</name><value>mapreduce_shuffle</value></property>
<property><name>yarn.nodemanager.bind-host</name><value>0.0.0.0</value></property>
<property><name>yarn.nodemanager.bind-host</name><value>0.0.0.0</value></property>
<property><name>yarn.nodemanager.remote-app-log-dir</name><value>/app-logs</value></property>
<property><name>yarn.timeline-service.bind-host</name><value>0.0.0.0</value></property>
<property><name>yarn.timeline-service.enabled</name><value>true</value></property>
<property><name>yarn.timeline-service.generic-application-history.enabled</name><value>true</value></property>
<property><name>yarn.timeline-service.hostname</name><value>historyserver.hadoop</value></property>
<property><name>yarn.timeline-service.leveldb-timeline-store.path</name><value>/hadoop/yarn/timeline</value></property>
</configuration>


@@ -0,0 +1,24 @@
#!/bin/bash
set -e
NAME=rclone-hdfs
. $(dirname "$0")/docker.bash
start() {
    docker run --rm -d --name "$NAME" -p 127.0.0.1:9866:9866 -p 127.0.0.1:8020:8020 --hostname "$NAME" rclone/test-hdfs

    # give the namenode and datanode time to come up
    sleep 10

    echo type=hdfs
    echo namenode=127.0.0.1:8020
    echo username=root
}

stop() {
    if status ; then
        docker kill $NAME
        echo "$NAME stopped"
    fi
}
. $(dirname "$0")/run.bash
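The fixed `sleep 10` keeps the script simple; a port-based wait would be one possible refinement (purely an illustration, not part of the commit):

```
# hypothetical replacement for the fixed sleep: poll the namenode RPC port
for i in $(seq 1 30); do
    if (exec 3<>/dev/tcp/127.0.0.1/8020) 2>/dev/null; then
        break
    fi
    sleep 1
done
```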