Hive処理まとめ


色々迷走したので取り敢えず簡単なものだけピックアップ。

点は街区レベル位置情報:全国1千3百79万件ほど
面は市区町村界:1900レコードほど

データコピー

mount -t cifs //ipあどれす/shared /mnt -o username=ゆーざ,password=ぱす
cp -r /mnt/json_data /tmp
umount /mnt

sudo -u hdfs hadoop fs -put /tmp/json_data/jpn/japan_ver71.json /user/hive/jpn
sudo -u hdfs hadoop fs -put /tmp/json_data/addr/addrwkt.csv /user/hive/addr

sudo -u hdfs hadoop fs -chmod -R 777 /use

DB作成とライブラリ読み込み。

hive

ADD JAR /usr/lib/hive/lib/esri-geometry-api.jar;
ADD JAR /usr/lib/hive/lib/spatial-sdk-hadoop.jar;

SHOW DATABASES;

CREATE DATABASE IF NOT EXISTS test;

use test;
show tables;

create temporary function ST_Contains as 'com.esri.hadoop.hive.ST_Contains';
create temporary function ST_Point as 'com.esri.hadoop.hive.ST_Point';
create temporary function ST_Aggr_Union as 'com.esri.hadoop.hive.ST_Aggr_Union';
create temporary function ST_Distance as 'com.esri.hadoop.hive.ST_Distance';
create temporary function ST_AsText as 'com.esri.hadoop.hive.ST_AsText';

テーブル作成

CREATE EXTERNAL TABLE IF NOT EXISTS
 jpn71
  (
    JCODE string
   ,KEN string
   ,SICHO string
   ,GUN string
   ,SEIREI string
   ,SIKUCHOSON string
   ,CITY_ENG string
   ,P_NUM INT
   ,H_NUM INT
   ,SHAPE binary
  )
ROW FORMAT SERDE 'com.esri.hadoop.hive.serde.JsonSerde'              
STORED AS INPUTFORMAT 'com.esri.json.hadoop.EnclosedJsonInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/user/hive/jpn';

CREATE EXTERNAL TABLE IF NOT EXISTS 
addrwkt
  (
   KEN string
  ,SHIKUTYOSON string
  ,OOAZATYOU string
  ,GAIKU string
  ,KEI INT
  ,X DOUBLE
  ,Y DOUBLE
  ,lat DOUBLE
  ,lon DOUBLE
  ,JyukyoCode INT
  ,DaihyoCode INT
  ,PreCode INT
  ,PostCode INT
  ,WKT string
  ,ID INT
  )
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/addr'
TBLPROPERTIES ("skip.header.line.count"="1")
;

文字列検索

SELECT
 addr.KEN
 ,addr.SHIKUTYOSON 
 ,addr.OOAZATYOU
 ,addr.GAIKU
FROM
 addrwkt addr
WHERE
 addr.OOAZATYOU LIKE '%銀座%';


SELECT
 jpn.KEN
 ,jpn.SIKUCHOSON 
FROM
 jpn71 jpn
WHERE
 jpn.KEN LIKE '%島%';

空間検索

SELECT
 jpn71.SIKUCHOSON 
FROM
 jpn71
WHERE
 ST_Contains(jpn71.shape, ST_Point(135, 35));


SELECT
 jpn.SIKUCHOSON
 ,count(*) cnt 
FROM
 ( 
    SELECT
      jpn71.shape shp
     ,jpn71.SIKUCHOSON 
    FROM
     jpn71
    WHERE
     jpn71.KEN LIKE  '%兵庫%'
  ) jpn
JOIN
 addrwkt
WHERE
 ST_Contains(jpn.shp, ST_Point(addrwkt.lon, addrwkt.lat))
GROUP BY
 jpn.SIKUCHOSON
ORDER BY
 cnt desc;


SELECT
 addr.KEN
 ,addr.SHIKUTYOSON 
 ,addr.OOAZATYOU
 ,addr.GAIKU
 ,ST_Distance(ST_Point(135,35), ST_Point(addr.lon, addr.lat)) dist
FROM
 addrwkt addr
ORDER BY
 dist ASC
LIMIT 1
;


SELECT
 jpn.KEN
 ,count(*) cnt 
FROM
 ( 
    SELECT
      ST_Aggr_Union(jpn71.shape) shp
     ,jpn71.KEN
    FROM
     jpn71
    WHERE
     jpn71.KEN LIKE  '%兵庫%'
    GROUP BY
     jpn71.KEN
  ) jpn
JOIN
 addrwkt
WHERE
 ST_Contains(jpn.shp, ST_Point(addrwkt.lon, addrwkt.lat))
GROUP BY
 jpn.KEN
ORDER BY
 cnt desc;

データのローカルデータへの引き出し

INSERT OVERWRITE LOCAL DIRECTORY '/tmp/jpnhyogo' 
SELECT
  jpn71.KEN
 ,jpn71.SIKUCHOSON
 ,ST_AsText(jpn71.shape)
FROM
 jpn71
WHERE
 jpn71.KEN LIKE  '%兵庫%'
;

exit;

引き出したデータの確認

dir /tmp/

mount -t cifs //ipあどれす/shared /mnt -o username=ゆーざ,password=ぱす
cp -r /tmp/jpnhyogo /mnt/jpnhyogo
umount /mnt
カテゴリー: 開発, 設計 タグ: , パーマリンク