[apache#554] improvement(CI-Hive): improve Hadoop access permission (apache#1194)

### What changes were proposed in this pull request?
- Set `hdfs` as the HDFS superuser group in the container
- Use `datastrato` as the Hive catalog integration test user instead of `root`

### Why are the changes needed?
We should not use the `root` user directly to access HDFS.

Fix: apache#554 
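For background, when HDFS runs with simple (non-Kerberos) authentication, the client takes its effective user from the `HADOOP_USER_NAME` environment variable; that is how the ITs switch from `root` to `datastrato`. A minimal sketch inside the `gravitino-ci-hive` container, assuming `HADOOP_HOME` is set as in the image's scripts:

```bash
# With simple auth, the HDFS client reads the effective user from
# HADOOP_USER_NAME instead of the OS user.
export HADOOP_USER_NAME=datastrato
${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /user/datastrato/demo
# The listing shows entries owned by "datastrato", not "root".
${HADOOP_HOME}/bin/hdfs dfs -ls /user/datastrato
```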

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing integration tests (ITs).
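For reference, the integration-test module reads a `testMode` project property that defaults to `embedded` (see the `build.gradle.kts` hunk below), so the existing ITs can presumably be run with something like:

```bash
# Hypothetical invocation; the module path and property name are taken
# from the build.gradle.kts changes in this diff.
./gradlew :integration-test:test -PtestMode=embedded
```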
mchades authored Dec 19, 2023
1 parent 4bafbd0 commit 9dddd0c
Showing 9 changed files with 60 additions and 21 deletions.
2 changes: 1 addition & 1 deletion dev/docker/build-docker.sh
@@ -64,7 +64,7 @@ fi

if [[ "${component_type}" == "hive" ]]; then
. ${script_dir}/hive/hive-dependency.sh
build_args="--build-arg HADOOP_PACKAGE_NAME=${HADOOP_PACKAGE_NAME} --build-arg HIVE_PACKAGE_NAME=${HIVE_PACKAGE_NAME}"
build_args="--build-arg HADOOP_PACKAGE_NAME=${HADOOP_PACKAGE_NAME} --build-arg HIVE_PACKAGE_NAME=${HIVE_PACKAGE_NAME} --build-arg JDBC_DIVER_PACKAGE_NAME=${JDBC_DIVER_PACKAGE_NAME}"
elif [ "${component_type}" == "trino" ]; then
. ${script_dir}/trino/trino-dependency.sh
elif [ "${component_type}" == "gravitino" ]; then
6 changes: 2 additions & 4 deletions dev/docker/hive/Dockerfile
@@ -8,6 +8,7 @@ LABEL maintainer="[email protected]"

ARG HADOOP_PACKAGE_NAME
ARG HIVE_PACKAGE_NAME
ARG JDBC_DIVER_PACKAGE_NAME

WORKDIR /

@@ -129,10 +130,7 @@ RUN sed -i "s/.*bind-address.*/bind-address = 0.0.0.0/" /etc/mysql/mysql.conf.d/

################################################################################
# add mysql jdbc driver
RUN wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.15.tar.gz
RUN tar -xzf mysql-connector-java-8.0.15.tar.gz
RUN cp mysql-connector-java-8.0.15/mysql-connector-java-8.0.15.jar ${HIVE_HOME}/lib
RUN rm -rf mysql-connector-java-8.0.15 mysql-connector-java-8.0.15.tar.gz
RUN tar -xz -C ${HIVE_HOME}/lib --strip-components 1 -f /tmp/packages/${JDBC_DIVER_PACKAGE_NAME}

################################################################################
# add users and groups
5 changes: 5 additions & 0 deletions dev/docker/hive/hdfs-site.xml
@@ -13,4 +13,9 @@
<name>dfs.datanode.address</name>
<value>0.0.0.0:50010</value>
</property>

<property>
<name>dfs.permissions.superusergroup</name>
<value>hdfs</value>
</property>
</configuration>
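A note on this property: HDFS treats members of the group named by `dfs.permissions.superusergroup` as superusers, so privileged operations such as `chown` succeed for them without running as the NameNode user. A sketch, assuming the image's user setup puts `datastrato` in the `hdfs` group:

```bash
# Assumption: "datastrato" is a member of the "hdfs" group in this image.
# Group members pass the HDFS superuser check, so chown is allowed.
export HADOOP_USER_NAME=datastrato
${HADOOP_HOME}/bin/hdfs dfs -chown -R datastrato:hdfs /user/datastrato
```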
8 changes: 8 additions & 0 deletions dev/docker/hive/hive-dependency.sh
@@ -10,13 +10,17 @@ hive_dir="$(cd "${hive_dir}">/dev/null; pwd)"
# Environment variables definition
HADOOP_VERSION="2.7.3"
HIVE_VERSION="2.3.9"
MYSQL_JDBC_DRIVER_VERSION="8.0.15"

HADOOP_PACKAGE_NAME="hadoop-${HADOOP_VERSION}.tar.gz" # Must export this variable for Dockerfile
HADOOP_DOWNLOAD_URL="http://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP_VERSION}/${HADOOP_PACKAGE_NAME}"

HIVE_PACKAGE_NAME="apache-hive-${HIVE_VERSION}-bin.tar.gz" # Must export this variable for Dockerfile
HIVE_DOWNLOAD_URL="https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/${HIVE_PACKAGE_NAME}"

JDBC_DIVER_PACKAGE_NAME="mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}.tar.gz" # Must export this variable for Dockerfile
JDBC_DIVER_DOWNLOAD_URL="https://downloads.mysql.com/archives/get/p/3/file/${JDBC_DIVER_PACKAGE_NAME}"

# Prepare download packages
if [[ ! -d "${hive_dir}/packages" ]]; then
mkdir -p "${hive_dir}/packages"
@@ -29,3 +33,7 @@ fi
if [ ! -f "${hive_dir}/packages/${HIVE_PACKAGE_NAME}" ]; then
curl -s -o "${hive_dir}/packages/${HIVE_PACKAGE_NAME}" ${HIVE_DOWNLOAD_URL}
fi

if [ ! -f "${hive_dir}/packages/${JDBC_DIVER_PACKAGE_NAME}" ]; then
curl -L -s -o "${hive_dir}/packages/${JDBC_DIVER_PACKAGE_NAME}" ${JDBC_DIVER_DOWNLOAD_URL}
fi
10 changes: 0 additions & 10 deletions dev/docker/hive/start.sh
@@ -12,16 +12,6 @@ ssh-keyscan 0.0.0.0 >> /root/.ssh/known_hosts
# start hdfs
${HADOOP_HOME}/sbin/start-dfs.sh

${HADOOP_HOME}/bin/hdfs dfs -mkdir /tmp
${HADOOP_HOME}/bin/hdfs dfs -chmod 1777 /tmp
${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /user/hive/warehouse
${HADOOP_HOME}/bin/hdfs dfs -chown -R hive:hive /user/hive
${HADOOP_HOME}/bin/hdfs dfs -chmod -R 775 /user/hive
${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /user/datastrato
${HADOOP_HOME}/bin/hdfs dfs -chown -R datastrato:hdfs /user/datastrato
${HADOOP_HOME}/bin/hdfs dfs -chmod 755 /user/datastrato
${HADOOP_HOME}/bin/hdfs dfs -chmod -R 777 /user/hive/tmp

# start mysql and create databases/users for hive
chown -R mysql:mysql /var/lib/mysql
usermod -d /var/lib/mysql/ mysql
6 changes: 5 additions & 1 deletion docs/docker-image-details.md
@@ -68,8 +68,12 @@ You can use this kind of images to test the catalog of Apache Hive.

Changelog

- gravitino-ci-hive:0.1.7
- Download MySQL JDBC driver before building the Docker image
- Set `hdfs` as HDFS superuser group

- gravitino-ci-hive:0.1.6
- No start YARN when container startup
- No starting YARN when container startup
- Removed expose ports:
- `22` SSH
- `8088` YARN Service
5 changes: 2 additions & 3 deletions integration-test/build.gradle.kts
@@ -265,14 +265,13 @@ tasks.test {

// Default use MiniGravitino to run integration tests
environment("GRAVITINO_ROOT_DIR", rootDir.path)
// TODO: use hive user instead after we fix the permission issue #554
environment("HADOOP_USER_NAME", "root")
environment("HADOOP_USER_NAME", "datastrato")
environment("HADOOP_HOME", "/tmp")
environment("PROJECT_VERSION", version)
environment("TRINO_CONF_DIR", buildDir.path + "/trino-conf")

// Gravitino CI Docker image
environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-hive:0.1.6")
environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-hive:0.1.7")
environment("GRAVITINO_CI_TRINO_DOCKER_IMAGE", "datastrato/gravitino-ci-trino:0.1.2")

val testMode = project.properties["testMode"] as? String ?: "embedded"
CatalogHiveIT.java
@@ -62,12 +62,17 @@
import com.datastrato.gravitino.rel.types.Types;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.Database;
@@ -106,6 +111,7 @@ public class CatalogHiveIT extends AbstractIT {
private static GravitinoMetaLake metalake;
private static Catalog catalog;
private static SparkSession sparkSession;
private static FileSystem hdfs;
private static final String SELECT_ALL_TEMPLATE = "SELECT * FROM %s.%s";
private static final String INSERT_WITHOUT_PARTITION_TEMPLATE = "INSERT INTO %s.%s VALUES (%s)";
private static final String INSERT_WITH_PARTITION_TEMPLATE =
@@ -155,13 +161,23 @@ public static void startup() throws Exception {
.config("mapreduce.input.fileinputformat.input.dir.recursive", "true")
.enableHiveSupport()
.getOrCreate();

Configuration conf = new Configuration();
conf.set(
"fs.defaultFS",
String.format(
"hdfs://%s:%d",
containerSuite.getHiveContainer().getContainerIpAddress(),
HiveContainer.HDFS_DEFAULTFS_PORT));
hdfs = FileSystem.get(conf);

createMetalake();
createCatalog();
createSchema();
}

@AfterAll
public static void stop() {
public static void stop() throws IOException {
client.dropMetalake(NameIdentifier.of(metalakeName));
if (hiveClientPool != null) {
hiveClientPool.close();
@@ -170,6 +186,10 @@ public static void stop() {
if (sparkSession != null) {
sparkSession.close();
}

if (hdfs != null) {
hdfs.close();
}
try {
closer.close();
} catch (Exception e) {
@@ -286,6 +306,19 @@ private void checkTableReadWrite(org.apache.hadoop.hive.metastore.api.Table tabl
}
Assertions.assertEquals(
count + 1, sparkSession.sql(String.format(SELECT_ALL_TEMPLATE, dbName, tableName)).count());
// Assert HDFS owner
Path tableDirectory = new Path(table.getSd().getLocation());
FileStatus[] fileStatuses;
try {
fileStatuses = hdfs.listStatus(tableDirectory);
} catch (IOException e) {
LOG.warn("Failed to list status of table directory", e);
throw new RuntimeException(e);
}
Assertions.assertTrue(fileStatuses.length > 0);
for (FileStatus fileStatus : fileStatuses) {
Assertions.assertEquals("datastrato", fileStatus.getOwner());
}
}

private Map<String, String> createProperties() {
ContainerSuite.java
@@ -68,7 +68,9 @@ public void startHiveContainer() {
HiveContainer.builder()
.withHostName("gravitino-ci-hive")
.withEnvVars(
ImmutableMap.<String, String>builder().put("HADOOP_USER_NAME", "root").build())
ImmutableMap.<String, String>builder()
.put("HADOOP_USER_NAME", "datastrato")
.build())
.withNetwork(network);
hiveContainer = closer.register(hiveBuilder.build());
hiveContainer.start();
