Hadoop installation guide

Requirements:

  • All machines run 64-bit Linux

  • Every machine uses the same username (for example: bi_app1)

  • The machines must be able to reach each other over SSH

  • Java JDK 6 or later is installed on every machine

  • On every machine, create the user bi_app1 and add it to the group hdgroup

Procedure: the steps below are performed the same way on every machine, including the namenode

  • Extract the Hadoop bundle into the home directory (~)

  • Append the following to ~/.bashrc:

# Set Hadoop-related environment variables

export HADOOP_HOME=/u01/app/bi_app1/hadoop/hadoop-2.2.0

# Set JAVA_HOME (we will also configure JAVA_HOME directly for Hadoop later on)

export JAVA_HOME=/u01/app/bi_app1/java/jdk1.7.0_55

export HADOOP_MAPRED_HOME=$HADOOP_HOME

export HADOOP_COMMON_HOME=$HADOOP_HOME

export HADOOP_HDFS_HOME=$HADOOP_HOME

export YARN_HOME=$HADOOP_HOME

export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop

export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop

 

export PATH=$JAVA_HOME/bin:$PATH:$HADOOP_HOME/bin

export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native

export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

# Some convenient aliases and functions for running Hadoop-related commands

unalias fs &> /dev/null

alias fs="hadoop fs"

unalias hls &> /dev/null

alias hls="fs -ls"

 

# If you have LZO compression enabled in your Hadoop cluster and

# compress job outputs with LZOP (not covered in this tutorial):

# Conveniently inspect an LZOP compressed file from the command

# line; run via:

#

# $ lzohead /hdfs/path/to/lzop/compressed/file.lzo

#

# Requires installed 'lzop' command.

#

lzohead () {

   hadoop fs -cat "$1" | lzop -dc | head -1000 | less

}
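To confirm the environment takes effect, reload the shell and run a quick sanity check (a minimal sketch; it assumes the bundle really was extracted to the HADOOP_HOME path above):

source ~/.bashrc
hadoop version        # should report Hadoop 2.2.0
echo $JAVA_HOME       # should print the JDK path configured above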
  • Add the following entries to /etc/hosts on every node, including the namenode (adjust the IP addresses to match your environment):

10.58.71.237 host237
10.58.71.245 host245
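After editing /etc/hosts, it is worth confirming that every hostname resolves on every node (a minimal check; ping may be blocked by firewalls in some environments):

getent hosts host237 host245     # should echo the two entries above
ping -c 1 host245                # optional reachability test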
  • Edit $HADOOP_HOME/etc/hadoop/hadoop-env.sh on every node, including the namenode

Add the following line (appending to any existing value, so the java.library.path option set in ~/.bashrc is preserved):

export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"

Reason: the current version of Hadoop is not yet compatible with IPv6, so IPv6 must be disabled.

  • Run the following on every machine:

chmod -R 700 ~/.ssh

chmod 750 ~
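The permissions matter because sshd refuses key-based logins when the home directory or ~/.ssh is group- or world-writable. A quick check:

ls -ld ~ ~/.ssh     # expect drwxr-x--- for ~ and drwx------ for ~/.ssh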
  • Apply the same configuration on every node, including the namenode (adjust file names and paths as appropriate)

File $HADOOP_HOME/etc/hadoop/hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>

<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

 

<configuration>

<property>

  <name>dfs.replication</name>

  <value>2</value>

  <description>Default block replication.

The actual number of replications can be specified when the file is created.

The default is used if replication is not specified at create time.

  </description>

</property>

<property>

    <name>dfs.permissions</name>

    <value>false</value>

</property>

<property>

  <name>dfs.name.dir</name>

  <value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/namedir</value>

  <description>Determines where on the local filesystem the DFS name node

     should store the name table. If this is a comma-delimited list

     of directories then the name table is replicated in all of the

       directories, for redundancy. </description>

</property>

<property>

  <name>dfs.data.dir</name>

  <value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/data</value>

</property>

<property>

  <name>dfs.namenode.handler.count</name>

  <value>2</value>

</property>

<property>

  <name>dfs.http.address</name>

  <value>host237:8031</value>

</property>

 

<property>

<name>dfs.secondary.http.address</name>

  <value>host237:8032</value>

</property>

 

<property>

  <name>fs.checkpoint.dir</name>

  <value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/checkPoint</value>

</property>

<property>

  <name>fs.checkpoint.edits.dir</name>

<value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/checkPointE</value>

</property>

<property>

  <name>dfs.namenode.secondary.http-address</name>

  <value>host237:8041</value>

</property>

 

<!-- DataNode 1 -->

<property>

  <name>dfs.datanode.address</name>

  <value>host237:8051</value>

</property>

<property>

  <name>dfs.datanode.http.address</name>

  <value>host237:8052</value>

</property>

<property>

  <name>dfs.datanode.https.address</name>

  <value>host237:8053</value>

</property>

<property>

  <name>dfs.datanode.ipc.address</name>

  <value>host237:8054</value>

</property>

 

<property>

  <name>dfs.https.address</name>

  <value>host237:8034</value>

</property>

 

<property>

    <name>dfs.namenode.http-address</name>

    <value>host237:8442</value>

</property>    

<property>

    <name>dfs.namenode.https-address</name>

    <value>host237:8443</value>

</property>    

<property>

    <name>dfs.namenode.backup.address</name>

    <value>host237:8444</value>

</property>                

<property>

    <name>dfs.namenode.backup.http-address</name>

    <value>host237:8445</value>

</property>    

<property>

    <name>dfs.journalnode.rpc-address</name>

    <value>host237:8446</value>

</property>    

<property>

    <name>dfs.journalnode.http-address</name>

    <value>host237:8447</value>

</property>    

 

</configuration>
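The local directories referenced above (dfs.name.dir, dfs.data.dir and the two checkpoint directories) should exist and be writable by bi_app1 before the first start; a minimal sketch using the paths configured above:

mkdir -p /u01/app/bi_app1/hadoop/hadoop-2.2.0/namedir \
         /u01/app/bi_app1/hadoop/hadoop-2.2.0/data \
         /u01/app/bi_app1/hadoop/hadoop-2.2.0/checkPoint \
         /u01/app/bi_app1/hadoop/hadoop-2.2.0/checkPointE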
  • File $HADOOP_HOME/etc/hadoop/mapred-site.xml

<?xml version="1.0"?>

<configuration>

 

<property>

    <name>mapreduce.framework.name</name>

    <value>yarn</value>

</property>

 

<property>

    <name>mapreduce.cluster.temp.dir</name>

    <value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/tempdir</value>

    <description>No description</description>

    <final>true</final>

</property>

 

  <property>

    <name>mapreduce.cluster.local.dir</name>

    <value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/localdir</value>

    <description>No description</description>

    <final>true</final>

  </property>

    <property>

         <name>mapreduce.jobtracker.http.address</name>

         <value>host237:8488</value>

    </property>

 

    <property>

         <name>mapreduce.tasktracker.http.address</name>

         <value>host245:8489</value>

    </property>

 

    <property>

         <name>mapreduce.shuffle.port</name>

         <value>8490</value>

    </property>

 

    <property>

         <name>mapreduce.jobhistory.address</name>

         <value>host237:8491</value>

    </property>

 

    <property>

         <name>mapreduce.jobhistory.webapp.address</name>

         <value>host237:8492</value>

    </property>

            <property>

            <name>yarn.app.mapreduce.am.job.client.port-range</name>

            <value>8000-9000</value>

            </property>

</configuration>
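As with hdfs-site.xml, the temp and local directories named here should be pre-created on every node (a sketch, reusing the paths above):

mkdir -p /u01/app/bi_app1/hadoop/hadoop-2.2.0/tempdir \
         /u01/app/bi_app1/hadoop/hadoop-2.2.0/localdir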
  • File $HADOOP_HOME/etc/hadoop/core-site.xml

<?xml version="1.0" encoding="UTF-8"?>

<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

 

<configuration>

<property>

  <name>fs.default.name</name>

  <value>hdfs://host237:8024</value>

  <description>The name of the default file system. A URI whose

scheme and authority determine the FileSystem implementation. The

uri's scheme determines the config property (fs.SCHEME.impl) naming

the FileSystem implementation class. The uri's authority is used to

determine the host, port, etc. for a filesystem.</description>

</property>

<property>

    <name>hadoop.tmp.dir</name>

    <value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/tmp</value>

  </property>

<property>

  <name>fs.inmemory.size.mb</name>

  <value>100</value>

</property>

<property>

  <name>io.sort.factor</name>

  <value>50</value>

</property>

<property>

  <name>io.sort.mb</name>

  <value>100</value>

</property>

</configuration>
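Once the configuration files are in place, hdfs getconf can confirm that Hadoop reads the values you expect (a quick sanity check; the getconf tool ships with Hadoop 2.x):

hdfs getconf -confKey fs.default.name     # should print hdfs://host237:8024
hdfs getconf -confKey hadoop.tmp.dir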
  • File $HADOOP_HOME/etc/hadoop/yarn-site.xml (on the master machine only)

<?xml version="1.0"?>

 

<configuration>

 

<!-- Site specific YARN configuration properties -->

<property>

    <name>yarn.resourcemanager.resource-tracker.address</name>

    <value>host237:8031</value>

    <description>host is the hostname of the resource manager and

   port is the port on which the NodeManagers contact the Resource Manager.

    </description>

  </property>

 

  <property>

    <name>yarn.resourcemanager.scheduler.address</name>

    <value>host237:8030</value>

    <description>host is the hostname of the resourcemanager and port is the port

   on which the Applications in the cluster talk to the Resource Manager.

    </description>

  </property>

 

  <property>

    <name>yarn.resourcemanager.scheduler.class</name>

    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>

    <description>In case you do not want to use the default scheduler</description>

  </property>

 

  <property>

    <name>yarn.resourcemanager.address</name>

    <value>host237:8032</value>

    <description>the host is the hostname of the ResourceManager and the port is the port on

   which the clients can talk to the Resource Manager. </description>

  </property>

 

  <property>

    <name>yarn.nodemanager.local-dirs</name>

    <value>/u01/app/bi_app1/spark/spark-0.9.2/tmp</value>

    <description>the local directories used by the nodemanager</description>

  </property>

 

<property>

    <name>yarn.nodemanager.address</name>

    <value>host237:9005</value>

    <description>the nodemanagers bind to this port</description>

  </property>

  <property>

    <name>yarn.nodemanager.resource.memory-mb</name>

   <value>8000</value>

    <description>the amount of memory available on the NodeManager, in MB</description>

  </property>

  <property>

    <name>yarn.nodemanager.remote-app-log-dir</name>

    <value>/applogs</value>

    <description>directory on hdfs where the application logs are moved to </description>

  </property>

 

    <property>

    <name>yarn.nodemanager.log-dirs</name>

    <value>/u01/app/bi_app1/hadoop/hadoop-2.2.0/nodelogs</value>

    <description>the directories used by Nodemanagers as log directories</description>

  </property>

 

  <property>

    <name>yarn.nodemanager.aux-services</name>

    <value>mapreduce_shuffle</value>

    <description>shuffle service that needs to be set for Map Reduce to run </description>

  </property>

    <property>

    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>

    <value>org.apache.hadoop.mapred.ShuffleHandler</value>

  </property>

 

<property>

    <name>yarn.resourcemanager.webapp.address</name>

    <value>host237:8065</value>

  </property>

  <property>

 <name>yarn.web-proxy.address</name>

    <value>host237:8066</value>

  </property>

    <property>

    <name>yarn.resourcemanager.admin.address</name>

    <value>host237:8067</value>

  </property>

  <property>

    <name>yarn.scheduler.minimum-allocation-mb</name>

    <value>1000</value>

  </property>

  <property>

    <name>yarn.scheduler.maximum-allocation-mb</name>

    <value>8000</value>

  </property>

    <property>

         <name>yarn.resourcemanager.rm.container-allocation.expiry-interval-ms</name>

         <value>1000000</value>

    </property>

            <property>

            <name>yarn.app.mapreduce.am.job.client.port-range</name>

            <value>8000-9000</value>

            </property>

 

            <property>

         <name>yarn.application.classpath</name>

         <value>

             $HADOOP_HOME/etc/hadoop,

             $HADOOP_HOME/share/hadoop/common/*,

             $HADOOP_HOME/share/hadoop/common/lib/*,

             $HADOOP_HOME/share/hadoop/hdfs/*,

             $HADOOP_HOME/share/hadoop/hdfs/lib/*,

             $HADOOP_HOME/share/hadoop/mapreduce/*,

             $HADOOP_HOME/share/hadoop/mapreduce/lib/*,

             $HADOOP_HOME/share/hadoop/yarn/*,

             $HADOOP_HOME/share/hadoop/yarn/lib/*

         </value>

    </property>

</configuration>
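Note that yarn.nodemanager.local-dirs and yarn.nodemanager.log-dirs are local filesystem paths, while yarn.nodemanager.remote-app-log-dir (/applogs) lives on HDFS. A sketch of the preparation, using the values above:

mkdir -p /u01/app/bi_app1/spark/spark-0.9.2/tmp          # local-dirs
mkdir -p /u01/app/bi_app1/hadoop/hadoop-2.2.0/nodelogs   # log-dirs

hadoop fs -mkdir /applogs     # on HDFS, once the cluster is up (see the startup step below)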
  • File $HADOOP_HOME/etc/hadoop/masters (on the master machine only)

host237
  • File $HADOOP_HOME/etc/hadoop/slaves (on the master machine only)

host237

host245
  • Configure passwordless SSH (on the master machine only)

ssh-keygen -t rsa -P ""               (press Enter to accept the default location)

cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys

Then distribute the key to the slave machines; these commands are also run on the master:

ssh-copy-id -i $HOME/.ssh/id_rsa.pub bi_app1@host245

ssh bi_app1@host245

ssh-copy-id -i $HOME/.ssh/id_rsa.pub bi_app1@host237

ssh bi_app1@host237
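The startup scripts rely on non-interactive SSH, so verify that a remote command now runs without a password prompt:

ssh bi_app1@host245 hostname     # should print host245 without prompting
ssh bi_app1@host237 hostname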
  • Start the system
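Before the very first start of a new cluster, format HDFS once on the master. This initializes dfs.name.dir and erases any previous name table, so run it only on a fresh install:

$HADOOP_HOME/bin/hdfs namenode -format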

On the master machine, run (from $HADOOP_HOME):

sbin/start-all.sh
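start-all.sh is deprecated in Hadoop 2.x; it still works, but the equivalent explicit form starts HDFS and YARN separately:

sbin/start-dfs.sh      # NameNode, SecondaryNameNode, DataNodes
sbin/start-yarn.sh     # ResourceManager, NodeManagers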

  • Check every machine with the jps command

On the master, at minimum the following processes must appear:

11669 DataNode

11551 NameNode

11831 SecondaryNameNode

12101 NodeManager

11980 ResourceManager

On a slave, at minimum the following processes must appear:

716 NodeManager

604 DataNode
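The PIDs shown will differ from run to run; only the process names matter. The web UIs give a further check (a sketch; note that the configuration above sets both dfs.http.address and dfs.namenode.http-address, so try whichever port actually answers):

curl -s http://host237:8442 | head -5     # NameNode web UI (dfs.namenode.http-address)
curl -s http://host237:8065 | head -5     # ResourceManager web UI (yarn.resourcemanager.webapp.address)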
