Manual Deployment
This tutorial targets CentOS 7.
This tutorial targets Mango version 16C. If you are installing a later version, replace the corresponding data-analysis-xxx.war package and the data-quality-calc.py script (if you replace the data-quality-x.x.xxx.jar package, you also need to update the matching parameter in the Python scripts under the dags directory; search for the keyword "jar").
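For reference, one way to locate the jar reference that needs updating (a sketch; it assumes the default dags directory under the AIRFLOW_HOME of /root/airflow used later in this guide):
# list every line in the DAG scripts that references the data-quality jar
grep -rn "data-quality-.*\.jar" /root/airflow/dags/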
Deployment host IP: 172.31.32.52
Deployment hostname: spark
Software versions:
- jdk: 8u211
- apache-spark: 2.3.3-bin-hadoop2.7
- python: 3.6.8
- airflow: 1.10.6
Initialize the system
Set the hostname
hostnamectl set-hostname spark
Add a hosts entry
IP=172.31.32.52
echo >> /etc/hosts
echo "${IP} spark" >> /etc/hosts
Stop/disable the firewall
systemctl stop firewalld
systemctl disable firewalld
Disable SELinux
setenforce 0
sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config
Configure SSH key-based login
ssh-keygen -f ~/.ssh/id_rsa -t rsa -N ''
cat /root/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
echo -e "Host *\n StrictHostKeyChecking no" > ~/.ssh/config
Install the JDK
Download and extract the JDK package
JDK_VER=8u211
mkdir -p /usr/java/
curl -sLO https://oss.hc-yun.com/mango_soft/jdk-${JDK_VER}-linux-x64.tar.gz
tar zxf jdk-${JDK_VER}-linux-x64.tar.gz -C /usr/java/
Configure environment variables
cat <<EOF > /etc/profile.d/jdk.sh
export JAVA_HOME=/usr/java/jdk1.8.0_211
export JRE_HOME=\${JAVA_HOME}/jre
export CLASSPATH=.:\$JAVA_HOME/lib/dt.jar:\$JAVA_HOME/lib/tools.jar:\$JRE_HOME/lib:\$CLASSPATH
export PATH=\$JAVA_HOME/bin:\$PATH
EOF
Load the environment variables
chmod +x /etc/profile.d/jdk.sh
source /etc/profile.d/jdk.sh
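A quick sanity check that the JDK is on the PATH (it should report version 1.8.0_211):
java -version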
Install/configure apache-spark
Download and extract apache-spark
SOFT_INSTALL_DIR=/home/hadoop
mkdir -p $SOFT_INSTALL_DIR
SPARK_VER=2.3.3-bin-hadoop2.7
curl -sLO https://dl.hc-yun.com/soft/spark-${SPARK_VER}.tgz
tar xf spark-${SPARK_VER}.tgz -C ${SOFT_INSTALL_DIR}
Configure environment variables
cat <<EOF >/etc/profile.d/spark.sh
export SPARK_HOME=${SOFT_INSTALL_DIR}/spark-${SPARK_VER}
export JAVA_HOME=${JAVA_HOME}
export PYSPARK_PYTHON=/usr/bin/python3
export PYSPARK_DRIVER_PYTHON=/usr/bin/python3
export PYTHONPATH=\$SPARK_HOME/python:\$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:\$PYTHONPATH
export PATH=\$PATH:\$SPARK_HOME/bin
EOF
source /etc/profile.d/spark.sh
Configure apache-spark (note: spark-env.sh sources the Python virtual environment that is created later in the Python 3.6 section)
# config spark-env.sh
cat <<EOF > ${SPARK_HOME}/conf/spark-env.sh
source /root/airflow/venv/bin/activate
export SPARK_HOME=${SOFT_INSTALL_DIR}/spark-${SPARK_VER}
export JAVA_HOME=${JAVA_HOME}
export PYSPARK_PYTHON=/usr/bin/python3
export PYSPARK_DRIVER_PYTHON=/usr/bin/python3
export SPARK_MASTER_IP=${IP}
export SPARK_WORKER_OPTS="-Dspark.worker.cleanup.enabled=true -Dspark.worker.cleanup.interval=18000 -Dspark.worker.cleanup.appDataTtl=36000"
export SPARK_PID_DIR=${SOFT_INSTALL_DIR}/spark-${SPARK_VER}/pids
export SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=file:///tmp/spark-events"
EOF
chmod +x ${SPARK_HOME}/conf/spark-env.sh
# config slaves
echo 'localhost' > ${SPARK_HOME}/conf/slaves
# config log4j.properties.template
/usr/bin/cp ${SPARK_HOME}/conf/log4j.properties.template ${SPARK_HOME}/conf/log4j.properties
/usr/bin/cp ${SPARK_HOME}/conf/spark-defaults.conf.template ${SPARK_HOME}/conf/spark-defaults.conf
echo 'spark.eventLog.enabled true' >>${SPARK_HOME}/conf/spark-defaults.conf
echo 'spark.eventLog.dir file:///tmp/spark-events' >> ${SPARK_HOME}/conf/spark-defaults.conf
# create eventLog.dir
mkdir -p /tmp/spark-events
Note: copy the Hadoop configuration file core-site.xml to /home/hadoop/spark-2.3.3-bin-hadoop2.7/conf
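One way to copy it over from a Hadoop node (a sketch; node01 and the Hadoop installation path are assumptions, adjust them to your cluster):
scp root@node01:/opt/hadoop/etc/hadoop/core-site.xml ${SPARK_HOME}/conf/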
Download the MySQL driver
cd ${SPARK_HOME}/jars
curl -sLO http://dl.hc-yun.com/soft/mysql-connector-java-5.1.38.jar
Start the services
# start spark
cd ${SPARK_HOME}/sbin
./start-all.sh
./start-history-server.sh
Add the start commands to rc.local
cat >> /etc/rc.local <<EOF
# start spark
cd ${SPARK_HOME}/sbin && ./start-all.sh && ./start-history-server.sh
EOF
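Note: on CentOS 7, /etc/rc.d/rc.local is not executable by default, so the commands appended to it will not run at boot until the file is made executable:
chmod +x /etc/rc.d/rc.local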
Verify
Master: http://172.31.32.52:8080
Worker: http://172.31.32.52:8081
HistoryServer: http://172.31.32.52:18080
### Install/configure Python 3.6
Install python36
yum install -y python36
Configure the pip mirror
mkdir -p ~/.pip
cat <<EOF > ~/.pip/pip.conf
[global]
index-url = https://mirrors.aliyun.com/pypi/simple
[install]
trusted-host = mirrors.aliyun.com
EOF
Create a python3 virtual environment
PYTHON_WORKDIR=/root/airflow/venv
/usr/bin/python3 -m venv $PYTHON_WORKDIR
source $PYTHON_WORKDIR/bin/activate
cd ${PYTHON_WORKDIR}
Activate the Python virtual environment automatically on login
echo -e "\n# python 3.6\nsource $PYTHON_WORKDIR/bin/activate" >> ~/.bashrc
Install/configure apache-airflow
Install dependencies
yum install -y mysql-devel gcc gcc-c++ python36-devel libevent-devel
Install apache-airflow
export SLUGIFY_USES_TEXT_UNIDECODE=yes
pip3 install apache-airflow[mysql,jdbc,hdfs,postgres]==1.10.6 \
--constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.6/constraints-3.6.txt"
yum install -y postgresql-devel
pip3 install psycopg2-binary
pip3 install pymysql
ln -sf ${PYTHON_WORKDIR}/lib/python3.6 /usr/local/lib/python3.6
ln -sf ${PYTHON_WORKDIR}/lib64/python3.6 /usr/local/lib64/python3.6
Generate the configuration file
export AIRFLOW_HOME=/root/airflow
export SLUGIFY_USES_TEXT_UNIDECODE=yes
cd ${AIRFLOW_HOME}
airflow version
Create the database
mysql -uroot -p -e 'create database airflow charset=utf8;'
Note: the MySQL configuration file may need the parameter explicit_defaults_for_timestamp=true (see the sketch below).
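A minimal sketch of the MySQL-side preparation, run on the MySQL server; the airflow/airflow account matches the connection string used below and is an assumption, so adjust the credentials to your environment:
# add under the [mysqld] section of /etc/my.cnf, then restart mysqld:
#   explicit_defaults_for_timestamp=true
mysql -uroot -p -e "CREATE USER 'airflow'@'%' IDENTIFIED BY 'airflow';"
mysql -uroot -p -e "GRANT ALL PRIVILEGES ON airflow.* TO 'airflow'@'%'; FLUSH PRIVILEGES;"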
Configure apache-airflow
# mysql_url='username:password@mysql_server:mysql_port/airflow'
mysql_url='airflow:airflow@172.31.32.51:3306/airflow'
sed -i 's#executor = .*#executor = LocalExecutor#' ${AIRFLOW_HOME}/airflow.cfg
sed -i 's#default_timezone =.*#default_timezone = Asia/Shanghai#' ${AIRFLOW_HOME}/airflow.cfg
sed -i "s#sql_alchemy_conn = .*#sql_alchemy_conn = mysql://${mysql_url}#" ${AIRFLOW_HOME}/airflow.cfg
sed -i 's#web_server_port = .*#web_server_port = 9090#' ${AIRFLOW_HOME}/airflow.cfg
sed -i 's#load_examples =.*#load_examples = False#' ${AIRFLOW_HOME}/airflow.cfg
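A quick check that the edits above took effect (it should show LocalExecutor, Asia/Shanghai, the MySQL connection URL, port 9090 and load_examples = False):
grep -E '^(executor|default_timezone|sql_alchemy_conn|web_server_port|load_examples) =' ${AIRFLOW_HOME}/airflow.cfg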
Change the time zone
time_zone_file=${PYTHON_WORKDIR}/lib/python3.6/site-packages/airflow/utils/timezone.py
cp ${time_zone_file}{,.default}
sed -i '27a \from airflow import configuration as conf' ${time_zone_file}
sed -i '28a \try:' ${time_zone_file}
sed -i '29a \    tz = conf.get("core", "default_timezone")' ${time_zone_file}
sed -i '30a \    if tz == "system":' ${time_zone_file}
sed -i '31a \        utc = pendulum.local_timezone()' ${time_zone_file}
sed -i '32a \    else:' ${time_zone_file}
sed -i '33a \        utc = pendulum.timezone(tz)' ${time_zone_file}
sed -i '34a \except Exception:' ${time_zone_file}
sed -i '35a \    pass' ${time_zone_file}
sed -i 's#d = dt.datetime.utcnow()#d = dt.datetime.now()#' ${time_zone_file}
time_sqlalchemy_file=${PYTHON_WORKDIR}/lib/python3.6/site-packages/airflow/utils/sqlalchemy.py
cp ${time_sqlalchemy_file}{,.default}
sed -i '37a \from airflow import configuration as conf' ${time_sqlalchemy_file}
sed -i '38a \try:' ${time_sqlalchemy_file}
sed -i '39a \    tz = conf.get("core", "default_timezone")' ${time_sqlalchemy_file}
sed -i '40a \    if tz == "system":' ${time_sqlalchemy_file}
sed -i '41a \        utc = pendulum.local_timezone()' ${time_sqlalchemy_file}
sed -i '42a \    else:' ${time_sqlalchemy_file}
sed -i '43a \        utc = pendulum.timezone(tz)' ${time_sqlalchemy_file}
sed -i '44a \except Exception:' ${time_sqlalchemy_file}
sed -i '45a \    pass' ${time_sqlalchemy_file}
time_html_file=${PYTHON_WORKDIR}/lib/python3.6/site-packages/airflow/www/templates/admin/master.html
cp ${time_html_file}{,.default}
sed -i 's#var UTCseconds =.*#var UTCseconds = x.getTime();#' ${time_html_file}
sed -i 's#"timeFormat":"H:i:s %UTC%",#"timeFormat":"H:i:s",#' ${time_html_file}
Initialize the database
airflow initdb
Start the services
PYTHON_WORKDIR=/root/airflow/venv
source ${PYTHON_WORKDIR}/bin/activate
airflow webserver -p 9090 -D
airflow scheduler -D
Add the start commands to rc.local
cat >> /etc/rc.local <<EOF
# start airflow
PYTHON_WORKDIR=/root/airflow/venv
source ${PYTHON_WORKDIR}/bin/activate
airflow webserver -p 9090 -D
airflow scheduler -D
EOF
Verify
http://172.31.32.52:9090
### Configure DataQualityPython
Download and extract the code
cd /home/hadoop
curl -sLO http://dl.hc-yun.com/soft/DataQualityPython.zip
unzip DataQualityPython.zip
Install the Python dependencies
source /root/airflow/venv/bin/activate
pip3 install Django==2.1.8
pip3 install scipy
Configure/test
Configure the allowed-hosts list (add the current IP to the list)
sed -i "s#ALLOWED_HOSTS =.*#ALLOWED_HOSTS = ['localhost', '127.0.0.1', '${IP}']#" /home/hadoop/DataQualityPython/DataQualityPython/settings.py
Test run
# generate db.sqlite3
cd DataQualityPython
python3 manage.py migrate
# check that the service works; after it starts, press Ctrl+C to stop it, then run it behind the uwsgi + nginx proxy
python3 manage.py runserver 0.0.0.0:7070
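From another shell, a quick check that the test server responds (the /admin path is the same one used in the verification step later; expect an HTTP 200 or a 302 redirect to the login page):
curl -sI http://127.0.0.1:7070/admin/ | head -n 1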
Install/configure uwsgi
Install uwsgi
pip3 install uwsgi
Create the configuration file
cat > uwsgi.ini <<EOF
[uwsgi]
# HTTP port for direct access; using it bypasses Nginx and hits uWSGI directly
#http = :8010
# socket Nginx uses to forward requests to uWSGI
socket = 127.0.0.1:7071
# enable the master process
master = true
# absolute path of the project directory on the server
chdir = /home/hadoop/DataQualityPython
# path to Django's wsgi.py
wsgi-file = /home/hadoop/DataQualityPython/DataQualityPython/wsgi.py
# maximum number of worker processes
processes = 4
# threads per process
threads = 2
# stats server address
stats = 127.0.0.1:9191
# clean up sockets and pid files on exit
vacuum = true
# restart automatically when files in this directory change
touch-reload = /home/hadoop/DataQualityPython
# restart automatically when Python files change
py-auto-reload = 1
# run in the background and write logs to this file
daemonize = /home/hadoop/DataQualityPython/logs/uWSGI.log
pidfile = uwsgi.pid
EOF
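Note: the daemonize log path above assumes the logs directory already exists; uwsgi does not create it, so create it before starting:
mkdir -p /home/hadoop/DataQualityPython/logs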
Start the service and add the start command to rc.local
uwsgi --ini /home/hadoop/DataQualityPython/uwsgi.ini
echo -e "\n# start DataQualityPython\nuwsgi --ini /home/hadoop/DataQualityPython/uwsgi.ini" >> /etc/rc.local
### Install/configure nginx
Install the EPEL repository
yum install -y epel-release
Install nginx
yum install -y nginx
Create the nginx configuration file
mv /etc/nginx/nginx.conf{,.bak}
cat <<'EOF' > /etc/nginx/nginx.conf
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log;
pid /run/nginx.pid;
include /usr/share/nginx/modules/*.conf;

events {
    worker_connections 1024;
}

http {
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for"';
    access_log /var/log/nginx/access.log main;

    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    types_hash_max_size 2048;

    include /etc/nginx/mime.types;
    default_type application/octet-stream;
    include /etc/nginx/conf.d/*.conf;

    server {
        listen 7070 default_server;
        server_name _;
        charset utf-8;
        client_max_body_size 75M;
        include /etc/nginx/default.d/*.conf;

        location /static {
            alias /home/hadoop/DataQualityPython/static;
        }

        location / {
            uwsgi_pass 127.0.0.1:7071;
            include /etc/nginx/uwsgi_params;
        }

        error_page 404 /404.html;
        location = /40x.html {
        }

        error_page 500 502 503 504 /50x.html;
        location = /50x.html {
        }
    }
}
EOF
Start the service and enable it at boot
systemctl start nginx
systemctl enable nginx
echo -e "\n# start nginxn\nsystemctl start nginx" >> /etc/rc.local
### Verify
http://172.31.32.52:7070/admin
### Configure data-quality
Download the jar package
mkdir -p /home/hadoop/data-quality
cd /home/hadoop/data-quality
curl -sLO http://122.114.2.195:16001/16B/data-quality-1.1.4.0004.jar
Create the configuration file
cat <<EOF > application.properties
# MySQL connection pool configuration
driverClassName=com.mysql.jdbc.Driver
url=jdbc:mysql://172.31.32.145:3306/loong?useUnicode=true&characterEncoding=utf-8&useSSL=false
username=root
password=zaq1@WSX
initialSize=10
minIdle=5
maxActive=500
# Hazelcast cache
hz.groupName=mango-jenkins
hz.password=dev-pass
hz.network=172.31.32.145:5701
# Kafka Configuration
kafka.acks=1
kafka.bootstrap-servers=172.31.32.141:9092,172.31.32.142:9092,172.31.32.143:9092
kafka.recalcDataTopic=recalcDataTopic
# Hadoop fs.defaultFS value
hdfs=hdfs://node01:9000
# Opentsdb Zookeeper
tsd.storage.hbase.zk_quorum=172.31.32.141:2181,172.31.32.142:2181,172.31.32.143:2181
EOF
Note: update the configuration above to match your actual Mango cluster, and set the hdfs value to the fs.defaultFS value from the core-site.xml configuration (see the Spark configuration section).
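One way to look up that value directly from the copied file (a sketch, assuming core-site.xml was placed under the Spark conf directory as described in the Spark section):
grep -A1 '<name>fs.defaultFS</name>' /home/hadoop/spark-2.3.3-bin-hadoop2.7/conf/core-site.xml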
Configure data-quality-python
Download the Python script
mkdir -p /home/hadoop/data-quality-python/jobs
cd /home/hadoop/data-quality-python/jobs
curl -sLO http://122.114.2.195:16001/16B/data-quality-calc.py
Create the configuration file
cat <<EOF > config.ini
[spark]
master=spark://master:7077
master-test=spark://${IP}:7077
[hadoop]
hdfs=hdfs://node01:9000
[path_data]
calc_dir=/dataQuality/calc/data/
recalc_dir=/dataQuality/recalc/data/
[path_result]
calc_dir=/dataQuality/calc/result/
recalc_dir=/dataQuality/recalc/result/
EOF
Note: set the hdfs value in /home/hadoop/data-quality-python/jobs/config.ini to the fs.defaultFS value from the core-site.xml configuration (see the Spark configuration section).
### Configure airflow
# in a browser, open [Admin] → [Connections] → [spark_default]
# change yarn to spark://spark:7077
http://172.31.32.52:9090/admin/connection/
# after restarting airflow, enable the following scheduled DAGs (via the web UI, or with the CLI sketch below):
common_operation_dag
data_quality_calc
data_quality_recalc
using_length_dag
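The same DAGs can also be enabled from the command line (a sketch; run it inside the virtual environment after the scheduler has loaded the DAG files):
for dag in common_operation_dag data_quality_calc data_quality_recalc using_length_dag; do
    airflow unpause ${dag}
done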
Script-based Deployment
Download the deployment script
curl -sLO https://dl.hc-yun.com/script/airflow-spark.sh
chmod +x airflow-spark.sh
./airflow-spark.sh
Note: after the deployment finishes, adjust the corresponding configuration according to the script output and verify the service status.
Verify
airflow:
http://172.31.32.52:9090
spark:
Master: http://172.31.32.52:8080
Worker: http://172.31.32.52:8081
HistoryServer: http://172.31.32.52:18080
DataQuality:
http://172.31.32.52:7070/admin
Container-based Deployment
1. Configure the hostname
# set the hostname
hostnamectl set-hostname spark
# add a hosts entry
IP=172.31.32.52
echo >> /etc/hosts
echo "${IP} spark" >> /etc/hosts
2. Install the Docker environment
# install the latest Docker CE
bash <(curl -sSL https://dwz.cn/XOJj0Njx) -i docker
# install docker-compose
bash <(curl -sSL https://dwz.cn/XOJj0Njx) -i compose
3. Download and extract the resources
cd /home
curl -OL http://122.114.2.195:16001/airflow/home-hadoop.tar.gz
curl -OL http://122.114.2.195:16001/airflow/airflow-spark.tar.gz
curl -OL http://122.114.2.195:16001/airflow/supervisor-airflow-1.10.6.tar.gz # not needed if you use the online image
tar xf home-hadoop.tar.gz
tar xf airflow-spark.tar.gz
4. Import the image
# import the image (skip this step if you use the online image)
gunzip -c supervisor-airflow-1.10.6.tar.gz | docker load
5. Configure
- 1. Copy the Hadoop configuration file core-site.xml to /home/hadoop/spark-2.3.3-bin-hadoop2.7/conf
- 2. Set the hdfs value in /home/hadoop/data-quality-python/jobs/config.ini to the fs.defaultFS value from core-site.xml
- 3. Update the settings in /home/hadoop/data-quality/application.properties to match the Mango services; the hdfs value is the same as above
- 4. Add the airflow/spark server IP address to the ALLOWED_HOSTS value in /home/hadoop/DataQualityPython/DataQualityPython/settings.py
6. Start the containers
cd airflow-spark
docker-compose up -d
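A quick status check after starting (optional; container names depend on the compose file):
docker-compose ps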
- In a browser, open [Admin] → [Connections] → [spark_default]: http://172.31.32.52:9090/admin/connection/
- Change yarn to spark://spark:7077, then restart the containers after saving