Big Data Learning Notes and Summary

2023-05-16

Assignment 1: Setting Up Hadoop in Pseudo-Distributed Mode
1. Disable the firewall
systemctl disable firewalld
2. Transfer the JDK and Hadoop archives to the VM (check the versions yourself)
Xshell and Xftp 6 are recommended for the transfer.
3. Extract the JDK and Hadoop archives (here they are installed under /opt/module)
tar -zxvf jdk-8u121-linux-x64.tar.gz -C /opt/module
tar -zxvf hadoop-2.7.3.tar.gz -C /opt/module
4. Configure the JDK and Hadoop environment variables and apply them
Edit /etc/profile with vi and add:
export JAVA_HOME=/opt/module/jdk1.8.0_121
export HADOOP_HOME=/opt/module/hadoop-2.7.3
export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH
Press Esc, then :wq! to save and exit.
Run source /etc/profile to apply the changes.
5. Modify the following four configuration files
(the hostname bigdata128 is used as an example)

①core-site.xml

<property>
    <name>fs.defaultFS</name>
    <value>hdfs://bigdata128:9000</value>
</property>
<property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/module/hadoop-2.7.3/tmp</value>
</property>

②hdfs-site.xml

<property>
    <name>dfs.replication</name>
    <value>1</value>
</property>
<property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>bigdata128:50090</value>
</property>

③mapred-site.xml (this file does not exist by default, so copy it from the template first)
cp mapred-site.xml.template mapred-site.xml

<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>

④yarn-site.xml

<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>bigdata128</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>

6. Modify the /etc/hosts file

Run the command: vi /etc/hosts

Comment out the existing entries and add the VM's IP address and its hostname:

192.168.163.128 bigdata128

Modify the /etc/hostname file

Run the command: vi /etc/hostname

Add the VM's hostname:

bigdata128

Reboot the VM so the new hostname takes effect.
7. Format the NameNode
hdfs namenode -format (if this is not the first format, first delete the tmp and logs directories under /opt/module/hadoop-2.7.3/)
8. Start the cluster
start-all.sh
9. Run jps; if the five daemons (NameNode, DataNode, SecondaryNameNode, ResourceManager, NodeManager) are all listed, the pseudo-distributed setup is complete.
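As a quick sanity check (not part of the original steps), a small Java program can list the HDFS root directory against the NameNode configured above. This is only a sketch: the class name HdfsSmokeTest is invented here, and the address must match the fs.defaultFS value in core-site.xml.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsSmokeTest {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Assumed to match fs.defaultFS in core-site.xml above
        conf.set("fs.defaultFS", "hdfs://bigdata128:9000");
        try (FileSystem fs = FileSystem.get(conf)) {
            // List everything under the HDFS root directory
            for (FileStatus status : fs.listStatus(new Path("/"))) {
                System.out.println(status.getPath());
            }
        }
    }
}

If the daemons are up, this prints the contents of the HDFS root (possibly nothing on a fresh cluster) instead of throwing a connection error.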

Assignment 2: Uploading and Downloading Files with HDFS
1. Make sure Eclipse is installed.
2. Create a Java project and write the program code.
The four programs:
1. HDFSDownload

package hdfs.files;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.*;

public class HDFSDownload {
    // Declare the input and output streams
    private static InputStream input;
    private static OutputStream output;

    public static void main(String[] args) throws IOException {
        // Run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS connection object client
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // Create the output stream for the local file
        output = new FileOutputStream("E:\\download.txt");
        // Open the input stream for the HDFS file
        input = client.open(new Path("/aadir/upload1.txt"));
        // Copy the file from HDFS to the local file system
        byte[] buffer = new byte[1024];
        int len = 0;
        while ((len = input.read(buffer)) != -1) {
            output.write(buffer, 0, len);
        }
        // Flush so the output is not left incomplete
        output.flush();
        // Alternatively, use the IOUtils utility class for the copy
        // IOUtils.copy(input, output);
        // Close the input and output streams
        input.close();
        output.close();
        System.out.println("Success!");
    }
}


2.HDFSFilelfExist

package hdfs.files;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
public class HDFSFilelfExist {
    public static void main(String[] args) throws IOException {
        // Run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS connection object client
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // The file to check
        String fileName = "/aadir/aaout.txt";
        // Check whether the file exists
        if (client.exists(new Path(fileName))) {
            System.out.println("The file exists!");
        } else {
            System.out.println("The file does not exist!");
        }
    }
}


3.HDFSMKdir

package hdfs.files;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFSMKdir {
    public static void main(String[] args) throws IOException {
        // Run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS connection object client
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // Create the aadir directory under the HDFS root
        client.mkdirs(new Path("/aadir"));
        // Close the connection object
        client.close();
        // Print a completion message
        System.out.println("successfully!");
    }
}

4.HDFSUpload

package hdfs.files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class HDFSUpload {
    // Declare the input and output streams
    private static InputStream input;
    private static OutputStream output;

    public static void main(String[] args) throws IOException {
        // Run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS connection object client
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // Create the input stream for the local file
        input = new FileInputStream("E:\\upload.txt");
        // Create the output stream for the HDFS file
        output = client.create(new Path("/aadir/upload1.txt"));
        // Write the file to HDFS
        byte[] buffer = new byte[1024];
        int len = 0;
        while ((len = input.read(buffer)) != -1) {
            output.write(buffer, 0, len);
        }
        // Flush so the output is not left incomplete
        output.flush();
        // Alternatively, use the IOUtils utility class for the copy
        // IOUtils.copy(input, output);
        // Close the input and output streams
        input.close();
        output.close();
        System.out.println("Success!");
    }
}
3. Run each program, then log in to the Hadoop web UI to check the results.
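The commented-out IOUtils line in the upload and download programs refers to Hadoop's org.apache.hadoop.io.IOUtils helper, which replaces the manual read/write loop. Below is a hedged sketch of the same upload using IOUtils.copyBytes; the class name HDFSUploadWithIOUtils is invented for illustration, and the paths and NameNode address are the same as in the programs above.

package hdfs.files;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HDFSUploadWithIOUtils {
    public static void main(String[] args) throws IOException {
        // Run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS connection object client
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // Local source file and HDFS target file
        InputStream input = new FileInputStream("E:\\upload.txt");
        OutputStream output = client.create(new Path("/aadir/upload1.txt"));
        // copyBytes handles the buffer loop; the final argument closes both streams
        IOUtils.copyBytes(input, output, 4096, true);
        System.out.println("Success!");
    }
}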

Assignment 3: Implementing MapReduce WordCount in Java
Program implementation:

package hdfs.files;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountDriver {

    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split each input line on spaces and emit (word, 1) for every word
            String line = value.toString();
            String[] words = line.split(" ");
            for (String w : words) {
                context.write(new Text(w), new IntWritable(1));
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the counts for each word and emit (word, total)
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        // Run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");

        // Create the job and set the driver class
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountDriver.class);

        // Set the mapper and reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input file and the output directory on HDFS
        FileInputFormat.setInputPaths(job, new Path("/usr/local/hdfs/input/cc.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/usr/local/hdfs/output"));

        // Submit the job and wait for it to finish
        boolean rs = job.waitForCompletion(true);
        System.exit(rs ? 0 : 1);
    }
}
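After the job finishes, the word counts land in the HDFS output directory. The following is a small follow-up sketch that prints the result; the class name WordCountResultReader is invented, and it assumes the default reducer output file name part-r-00000 and the same NameNode address as in assignment two.

package hdfs.files;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WordCountResultReader {
    public static void main(String[] args) throws IOException {
        // Run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        try (FileSystem fs = FileSystem.get(conf);
             BufferedReader reader = new BufferedReader(new InputStreamReader(
                     fs.open(new Path("/usr/local/hdfs/output/part-r-00000"))))) {
            // Each output line is "word<TAB>count"
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}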

Assignment 4: Installing and Configuring HBase
1. Download the package from the official mirror: https://mirrors.tuna.tsinghua.edu.cn/apache/hbase/stable/
2. Transfer the HBase archive to the VM (check the version yourself)
3. Extract HBase (here it is installed under /opt/module; use the file name of the archive you downloaded)
tar -zxvf hbase-x.y.z-bin.tar.gz -C /opt/module
4. Configure the HBase environment variables and apply them
Edit /etc/profile with vi and add:
export HBASE_HOME=/opt/module/hbase   (example path)
export PATH=$HBASE_HOME/bin:$PATH
Press Esc, then :wq! to save and exit.
Run source /etc/profile to apply the changes.
5. Modify hbase-env.sh

export JAVA_HOME=<Java installation path>
export HBASE_CLASSPATH=<HBase installation directory>
export HBASE_MANAGES_ZK=true


6. Modify hbase-site.xml

<!-- HBase root directory on HDFS, where HBase data is persisted -->
<!-- Must point at the fs.defaultFS address configured in core-site.xml -->
<property>
        <name>hbase.rootdir</name>
        <value>hdfs://bigdata128:9000/hbase</value>
</property>
<!-- Distributed mode; false (the default) means standalone mode -->
<property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
</property>

<!-- Address list of the ZooKeeper quorum; for pseudo-distributed mode keep the default localhost -->
<property>
        <name>hbase.zookeeper.quorum</name>
        <value>localhost</value>
</property>

7. Start HBase
start-hbase.sh
8. Package and run the Java program

Program code:

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.concurrent.ExecutorService;

import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.security.UserProvider;


/**
 * A non-instantiable class that manages creation of {@link Connection}s.
 * Managing the lifecycle of the {@link Connection}s to the cluster is the responsibility of
 * the caller.
 * From a {@link Connection}, {@link Table} implementations are retrieved
 * with {@link Connection#getTable(TableName)}. Example:
 * <pre>
 * Connection connection = ConnectionFactory.createConnection(config);
 * Table table = connection.getTable(TableName.valueOf("table1"));
 * try {
 *   // Use the table as needed, for a single operation and a single thread
 * } finally {
 *   table.close();
 *   connection.close();
 * }
 * </pre>
 *
 * Similarly, {@link Connection} also returns {@link Admin} and {@link RegionLocator}
 * implementations.
 *
 * This class replaces {@link HConnectionManager}, which is now deprecated.
 * @see Connection
 * @since 0.99.0
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class ConnectionFactory {

  /** No public c.tors */
  protected ConnectionFactory() {
  }

  /**
   * Create a new Connection instance using default HBaseConfiguration. Connection
   * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces
   * created from returned connection share zookeeper connection, meta cache, and connections
   * to region servers and masters.
   * <br>
   * The caller is responsible for calling {@link Connection#close()} on the returned
   * connection instance.
   *
   * Typical usage:
   * <pre>
   * Connection connection = ConnectionFactory.createConnection();
   * Table table = connection.getTable(TableName.valueOf("mytable"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * </pre>
   *
   * @return Connection object for <code>conf</code>
   */
  public static Connection createConnection() throws IOException {
    return createConnection(HBaseConfiguration.create(), null, null);
  }

  /**
   * Create a new Connection instance using the passed <code>conf</code> instance. Connection
   * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces
   * created from returned connection share zookeeper connection, meta cache, and connections
   * to region servers and masters.
   * <br>
   * The caller is responsible for calling {@link Connection#close()} on the returned
   * connection instance.
   *
   * Typical usage:
   * <pre>
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("mytable"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * </pre>
   *
   * @param conf configuration
   * @return Connection object for <code>conf</code>
   */
  public static Connection createConnection(Configuration conf) throws IOException {
    return createConnection(conf, null, null);
  }

  /**
   * Create a new Connection instance using the passed <code>conf</code> instance. Connection
   * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces
   * created from returned connection share zookeeper connection, meta cache, and connections
   * to region servers and masters.
   * <br>
   * The caller is responsible for calling {@link Connection#close()} on the returned
   * connection instance.
   *
   * Typical usage:
   * <pre>
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("mytable"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * </pre>
   *
   * @param conf configuration
   * @param pool the thread pool to use for batch operations
   * @return Connection object for <code>conf</code>
   */
  public static Connection createConnection(Configuration conf, ExecutorService pool)
      throws IOException {
    return createConnection(conf, pool, null);
  }

  /**
   * Create a new Connection instance using the passed <code>conf</code> instance. Connection
   * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces
   * created from returned connection share zookeeper connection, meta cache, and connections
   * to region servers and masters.
   * <br>
   * The caller is responsible for calling {@link Connection#close()} on the returned
   * connection instance.
   *
   * Typical usage:
   * <pre>
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("table1"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * </pre>
   *
   * @param conf configuration
   * @param user the user the connection is for
   * @return Connection object for <code>conf</code>
   */
  public static Connection createConnection(Configuration conf, User user)
  throws IOException {
    return createConnection(conf, null, user);
  }

  /**
   * Create a new Connection instance using the passed <code>conf</code> instance. Connection
   * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces
   * created from returned connection share zookeeper connection, meta cache, and connections
   * to region servers and masters.
   * <br>
   * The caller is responsible for calling {@link Connection#close()} on the returned
   * connection instance.
   *
   * Typical usage:
   * <pre>
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("table1"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * </pre>
   *
   * @param conf configuration
   * @param user the user the connection is for
   * @param pool the thread pool to use for batch operations
   * @return Connection object for <code>conf</code>
   */
  public static Connection createConnection(Configuration conf, ExecutorService pool, User user)
  throws IOException {
    if (user == null) {
      UserProvider provider = UserProvider.instantiate(conf);
      user = provider.getCurrent();
    }

    return createConnection(conf, false, pool, user);
  }

  static Connection createConnection(final Configuration conf, final boolean managed,
      final ExecutorService pool, final User user)
  throws IOException {
    String className = conf.get(HConnection.HBASE_CLIENT_CONNECTION_IMPL,
      ConnectionManager.HConnectionImplementation.class.getName());
    Class<?> clazz = null;
    try {
      clazz = Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
    try {
      // Default HCM#HCI is not accessible; make it so before invoking.
      Constructor<?> constructor =
        clazz.getDeclaredConstructor(Configuration.class,
          boolean.class, ExecutorService.class, User.class);
      constructor.setAccessible(true);
      return (Connection) constructor.newInstance(conf, managed, pool, user);
    } catch (Exception e) {
      throw new IOException(e);
    }
  }
}
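The javadoc above already shows the intended usage pattern. Below is a minimal sketch that follows it, assuming HBase is running with the hbase-site.xml configuration above and that a table named table1 with a column family cf has already been created (for example from the HBase shell). The class name, row key, and column names are illustrative only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseConnectionDemo {
    public static void main(String[] args) throws IOException {
        // Picks up hbase-site.xml from the classpath; the quorum matches the config above
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "localhost");
        Connection connection = ConnectionFactory.createConnection(config);
        Table table = connection.getTable(TableName.valueOf("table1"));
        try {
            // Write one cell: row "row1", column family "cf", qualifier "col"
            Put put = new Put(Bytes.toBytes("row1"));
            put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("col"), Bytes.toBytes("value"));
            table.put(put);
        } finally {
            // The caller is responsible for closing the table and the connection
            table.close();
            connection.close();
        }
    }
}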


Assignment 5: Installing and Using Redis
Download Redis with the wget command, then install and use it.
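To use Redis from Java, consistent with the other assignments, the Jedis client is a common choice. This is only a hedged sketch: it assumes Redis is running locally on the default port 6379 and that the jedis dependency has been added to the project.

import redis.clients.jedis.Jedis;

public class RedisDemo {
    public static void main(String[] args) {
        // Connect to the local Redis server on the default port
        Jedis jedis = new Jedis("localhost", 6379);
        try {
            // Simple set/get round trip
            jedis.set("greeting", "hello from java");
            System.out.println(jedis.get("greeting"));
        } finally {
            jedis.close();
        }
    }
}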

Assignment 6: Installing and Using Hive
Part 1: Install MySQL
Download the repository package: wget http://dev.mysql.com/get/mysql-community-release-el7-5.noarch.rpm
Install the repository package: rpm -ivh mysql-community-release-el7-5.noarch.rpm
Install MySQL: yum install mysql-community-server
Restart the MySQL service: service mysqld restart
Log in: mysql -u root
Set the root user's password to root: mysql> set password for 'root'@'localhost' = password('root');
Add the encoding setting to /etc/my.cnf: [mysql] default-character-set=utf8
Grant remote privileges: grant all privileges on *.* to root@'%' identified by 'root';
Refresh the privileges: flush privileges;
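Before moving on to Hive, it can help to confirm that Java can reach this MySQL instance with the same driver class that hive-site.xml uses below (com.mysql.cj.jdbc.Driver). This is a hedged sketch with an invented class name, assuming the MySQL Connector/J jar is on the classpath and the root/root credentials set above.

import java.sql.Connection;
import java.sql.DriverManager;

public class MysqlConnectionCheck {
    public static void main(String[] args) throws Exception {
        // Same driver class as configured for the Hive metastore below
        Class.forName("com.mysql.cj.jdbc.Driver");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://127.0.0.1:3306/?characterEncoding=UTF-8&serverTimezone=GMT%2B8",
                "root", "root")) {
            System.out.println("Connected: " + !conn.isClosed());
        }
    }
}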
Part 2: Install and Configure Hive
Download the package from the official mirror http://mirror.bit.edu.cn/apache/hive/ and upload it to the VM.
Extract and install it to the target directory /opt/module.
Edit /etc/profile and add the HIVE_HOME installation path.
Run source /etc/profile so the changes take effect.
Configure hive-env.sh:

cp hive-env.sh.template hive-env.sh

Set the Hadoop installation path:
HADOOP_HOME=/opt/module/hadoop-2.7.3
Set the path of Hive's conf directory:
export HIVE_CONF_DIR=/opt/module/hive/conf


Configure hive-site.xml:

 <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://127.0.0.1:3306/hive?characterEncoding=UTF-8&amp;serverTimezone=GMT%2B8</value>
    <description>
      JDBC connect string for a JDBC metastore.
      To use SSL to encrypt/authenticate the connection, provide database-specific SSL flag in the connection URL.
      For example, jdbc:postgresql://myhost/db?ssl=true for postgres database.
    </description>
  </property>
 
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
 
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>Username to use against metastore database</description>
  </property>
 
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
    <description>password to use against metastore database</description>
  </property>

 <property>
    <name>hive.exec.local.scratchdir</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/tmp/${user.name}</value>
    <description>Local scratch space for Hive jobs</description>
  </property>
 
  <property>
    <name>hive.downloaded.resources.dir</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp/${hive.session.id}_resources</value>
    <description>Temporary local directory for added resources in the remote file system.</description>
  </property>
 
  <property>
    <name>hive.querylog.location</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp/${system:user.name}</value>
    <description>Location of Hive run time structured log file</description>
  </property>
 
  <property>
    <name>hive.server2.logging.operation.log.location</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp/${system:user.name}/operation_logs</value>
    <description>Top level directory where operation logs are stored if logging functionality is enabled</description>
  </property>
 
  <property>
    <name>hive.server2.thrift.bind.host</name>
    <value>bigdata</value>
    <description>Bind host on which to run the HiveServer2 Thrift service.</description>
  </property>
 
  <property>
    <name>system:java.io.tmpdir</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp</value>
    <description/>
  </property>

Initialize the metastore, then restart Hive and it is ready to use.
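Once HiveServer2 is running, a quick way to exercise it from Java is through the Hive JDBC driver. This is a hedged sketch (the class name is invented) assuming the hive-jdbc dependency is on the classpath, HiveServer2 listens on its default Thrift port 10000, and the bind host is the bigdata value configured above.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveJdbcSmokeTest {
    public static void main(String[] args) throws Exception {
        // Hive JDBC driver class for HiveServer2
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:hive2://bigdata:10000/default", "root", "");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SHOW DATABASES")) {
            // Print the available databases as a basic connectivity check
            while (rs.next()) {
                System.out.println(rs.getString(1));
            }
        }
    }
}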
