利用LineageLogger分析HiveQL中的字段级别血缘关系

  |   0 评论   |   6,806 浏览

概述

Hive提供了org.apache.hadoop.hive.ql.hooks.LineageLogger类,可以用来分析HiveQL中的字段级别血缘关系

配置

添加参数

vim /usr/local/hive/conf/hive-site.xml
<property>
    <name>hive.exec.post.hooks</name>
    <value>org.apache.hadoop.hive.ql.hooks.LineageLogger</value>
</property>

hive.exec.post.hooks参数介绍

执行后置条件。一个用逗号分隔开的实现了org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext接口的java class列表,配置了该参数后,每个hiveQL语句执行后都要执行这个钩子,默认是空;

修改日志级别

vim /usr/local/hive/conf/hive-log4j.properties

在文件末尾追加

log4j.logger.org.apache.hadoop.hive.ql.hooks.LineageLogger=INFO

默认日志输出在/tmp/${user.home}/hive.log

测试

目前hive支持以下四种语句的血缘分析

  • HiveOperation.QUERY

  • HiveOperation.CREATETABLE_AS_SELECT

  • HiveOperation.ALTERVIEW_AS

  • HiveOperation.CREATEVIEW

下面分别介绍

QUERY

执行以下HiveQL

SELECT z.zoneid AS zone_id,
        z.zonename AS zone_name,
         c.cityid AS city_id,
         c.cityname AS city_name
FROM dict_zoneinfo z
LEFT JOIN dict_cityinfo c
    ON z.cityid = c.cityid
        AND z.dt='20171109'
        AND c.dt='20171109'
WHERE z.dt='20171109'
        AND c.dt='20171109'
LIMIT 10;

查看/tmp/hadoop/hive.log,有以下输出

2017-11-10 18:02:50,775 INFO  [main]: hooks.LineageLogger (LineageLogger.java:run(193)) - {"version":"1.0","user":"hadoop","timestamp":1510308124,"duration":45959,"jobIds":["job_1509088410884_16752"],"engine":"mr","database":"cxy7_dw","hash":"a184be21aadb9dd5b6c950fe0b3298d8","queryText":"SELECT z.zoneid AS zone_id,z.zonename AS zone_name, c.cityid AS city_id, c.cityname AS city_name FROM dict_zoneinfo z LEFT JOIN dict_cityinfo c ON z.cityid = c.cityid AND z.dt='20171109' AND c.dt='20171109' WHERE z.dt='20171109' AND c.dt='20171109' LIMIT 10","edges":[{"sources":[4],"targets":[0],"edgeType":"PROJECTION"},{"sources":[5],"targets":[1],"edgeType":"PROJECTION"},{"sources":[6],"targets":[2],"edgeType":"PROJECTION"},{"sources":[7],"targets":[3],"edgeType":"PROJECTION"},{"sources":[8,6],"targets":[0,1,2,3],"expression":"(z.cityid = c.cityid)","edgeType":"PREDICATE"},{"sources":[9],"targets":[0,1,2,3],"expression":"(c.dt = '20171109')","edgeType":"PREDICATE"},{"sources":[10,9],"targets":[0,1,2,3],"expression":"((z.dt = '20171109') and (c.dt = '20171109'))","edgeType":"PREDICATE"}],"vertices":[{"id":0,"vertexType":"COLUMN","vertexId":"zone_id"},{"id":1,"vertexType":"COLUMN","vertexId":"zone_name"},{"id":2,"vertexType":"COLUMN","vertexId":"city_id"},{"id":3,"vertexType":"COLUMN","vertexId":"city_name"},{"id":4,"vertexType":"COLUMN","vertexId":"cxy7_dw.dict_zoneinfo.zoneid"},{"id":5,"vertexType":"COLUMN","vertexId":"cxy7_dw.dict_zoneinfo.zonename"},{"id":6,"vertexType":"COLUMN","vertexId":"cxy7_dw.dict_cityinfo.cityid"},{"id":7,"vertexType":"COLUMN","vertexId":"cxy7_dw.dict_cityinfo.cityname"},{"id":8,"vertexType":"COLUMN","vertexId":"cxy7_dw.dict_zoneinfo.cityid"},{"id":9,"vertexType":"COLUMN","vertexId":"cxy7_dw.dict_cityinfo.dt"},{"id":10,"vertexType":"COLUMN","vertexId":"cxy7_dw.dict_zoneinfo.dt"}]}

格式化一下

{
    "version": "1.0",
    "user": "hadoop",
    "timestamp": 1510308124,
    "duration": 45959,
    "jobIds": [
        "job_1509088410884_16752"
    ],
    "engine": "mr",
    "database": "cxy7_dw",
    "hash": "a184be21aadb9dd5b6c950fe0b3298d8",
    "queryText": "SELECT z.zoneid AS zone_id,z.zonename AS zone_name, c.cityid AS city_id, c.cityname AS city_name FROM dict_zoneinfo z LEFT JOIN dict_cityinfo c ON z.cityid = c.cityid AND z.dt='20171109' AND c.dt='20171109' WHERE z.dt='20171109' AND c.dt='20171109' LIMIT 10",
    "edges": [
        {
            "sources": [
                4
            ],
            "targets": [
                0
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                5
            ],
            "targets": [
                1
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                6
            ],
            "targets": [
                2
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                7
            ],
            "targets": [
                3
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                8,
                6
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "(z.cityid = c.cityid)",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                9
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "(c.dt = '20171109')",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                10,
                9
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "((z.dt = '20171109') and (c.dt = '20171109'))",
            "edgeType": "PREDICATE"
        }
    ],
    "vertices": [
        {
            "id": 0,
            "vertexType": "COLUMN",
            "vertexId": "zone_id"
        },
        {
            "id": 1,
            "vertexType": "COLUMN",
            "vertexId": "zone_name"
        },
        {
            "id": 2,
            "vertexType": "COLUMN",
            "vertexId": "city_id"
        },
        {
            "id": 3,
            "vertexType": "COLUMN",
            "vertexId": "city_name"
        },
        {
            "id": 4,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.zoneid"
        },
        {
            "id": 5,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.zonename"
        },
        {
            "id": 6,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityid"
        },
        {
            "id": 7,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityname"
        },
        {
            "id": 8,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.cityid"
        },
        {
            "id": 9,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.dt"
        },
        {
            "id": 10,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.dt"
        }
    ]
}

格式说明

  • vertices:顶点。代表参与DAG的节点元素,vertexType有COLUMN和TABLE两个值

  • edges:边。代表DAG的流向,由sources指向targets,edgeType有PROJECTION(投影)和PREDICATE(谓语)两个值

CREATETABLE_AS_SELECT

执行以下HiveQL

CREATE TABLE tmp_zone_info AS
SELECT z.zoneid AS zone_id,
         z.zonename AS zone_name,
         c.cityid AS city_id,
         c.cityname AS city_name
FROM dict_zoneinfo z
LEFT JOIN dict_cityinfo c
    ON z.cityid = c.cityid
        AND z.dt='20171109'
        AND c.dt='20171109'
WHERE z.dt='20171109'
        AND c.dt='20171109';

对应的血缘关系

{
    "version": "1.0",
    "user": "hadoop",
    "timestamp": 1510307578,
    "duration": 30629,
    "jobIds": [
        "job_1509088410884_16739"
    ],
    "engine": "mr",
    "database": "cxy7_dw",
    "hash": "4484378cebc5e2b0b55fb34368d861b0",
    "queryText": "CREATE TABLE tmp_zone_info AS SELECT z.zoneid AS zone_id,z.zonename AS zone_name, c.cityid AS city_id, c.cityname AS city_name FROM dict_zoneinfo z LEFT JOIN dict_cityinfo c ON z.cityid = c.cityid AND z.dt='20171109' AND c.dt='20171109' WHERE z.dt='20171109' AND c.dt='20171109'",
    "edges": [
        {
            "sources": [
                4
            ],
            "targets": [
                0
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                5
            ],
            "targets": [
                1
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                6
            ],
            "targets": [
                2
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                7
            ],
            "targets": [
                3
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                8,
                6
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "(z.cityid = c.cityid)",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                9
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "(c.dt = '20171109')",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                10,
                9
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "((z.dt = '20171109') and (c.dt = '20171109'))",
            "edgeType": "PREDICATE"
        }
    ],
    "vertices": [
        {
            "id": 0,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_zone_info.zone_id"
        },
        {
            "id": 1,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_zone_info.zone_name"
        },
        {
            "id": 2,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_zone_info.city_id"
        },
        {
            "id": 3,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_zone_info.city_name"
        },
        {
            "id": 4,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.zoneid"
        },
        {
            "id": 5,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.zonename"
        },
        {
            "id": 6,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityid"
        },
        {
            "id": 7,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityname"
        },
        {
            "id": 8,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.cityid"
        },
        {
            "id": 9,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.dt"
        },
        {
            "id": 10,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.dt"
        }
    ]
}

CREATEVIEW

执行以下HiveQL

CREATE VIEW tmp_view_zone_info ASSELECT z.zoneid AS zone_id,
        z.zonename AS zone_name,
         c.cityid AS city_id,
         c.cityname AS city_name
FROM dict_zoneinfo z
LEFT JOIN dict_cityinfo c
    ON z.cityid = c.cityid
        AND z.dt='20171109'
        AND c.dt='20171109'
WHERE z.dt='20171109'
        AND c.dt='20171109';

对应的血缘关系

{
    "version": "1.0",
    "user": "hadoop",
    "timestamp": 1510308427,
    "duration": 345,
    "jobIds": [],
    "engine": "mr",
    "database": "cxy7_dw",
    "hash": "6934c23c64e993ea42f57d3344125eb6",
    "queryText": "CREATE VIEW tmp_view_zone_info AS SELECT z.zoneid AS zone_id,z.zonename AS zone_name, c.cityid AS city_id, c.cityname AS city_name FROM dict_zoneinfo z LEFT JOIN dict_cityinfo c ON z.cityid = c.cityid AND z.dt='20171109' AND c.dt='20171109' WHERE z.dt='20171109' AND c.dt='20171109'",
    "edges": [
        {
            "sources": [
                4
            ],
            "targets": [
                0
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                5
            ],
            "targets": [
                1
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                6
            ],
            "targets": [
                2
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                7
            ],
            "targets": [
                3
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                8,
                6
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "(z.cityid = c.cityid)",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                9
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "(c.dt = '20171109')",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                10,
                9
            ],
            "targets": [
                0,
                1,
                2,
                3
            ],
            "expression": "((z.dt = '20171109') and (c.dt = '20171109'))",
            "edgeType": "PREDICATE"
        }
    ],
    "vertices": [
        {
            "id": 0,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_view_zone_info.zone_id"
        },
        {
            "id": 1,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_view_zone_info.zone_name"
        },
        {
            "id": 2,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_view_zone_info.city_id"
        },
        {
            "id": 3,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_view_zone_info.city_name"
        },
        {
            "id": 4,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.zoneid"
        },
        {
            "id": 5,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.zonename"
        },
        {
            "id": 6,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityid"
        },
        {
            "id": 7,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityname"
        },
        {
            "id": 8,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.cityid"
        },
        {
            "id": 9,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.dt"
        },
        {
            "id": 10,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.dt"
        }
    ]
}

ALTERVIEW_AS

执行以下HiveQL

ALTER VIEW tmp_view_zone_info AS
SELECT z.zonename AS zone_name,
         c.cityname AS city_name
FROM dict_zoneinfo z
LEFT JOIN dict_cityinfo c
    ON z.cityid = c.cityid
        AND z.dt='20171109'
        AND c.dt='20171109'
WHERE z.dt='20171109'
        AND c.dt='20171109';

对应的血缘关系

{
    "version": "1.0",
    "user": "hadoop",
    "timestamp": 1510308583,
    "duration": 169,
    "jobIds": [],
    "engine": "mr",
    "database": "cxy7_dw",
    "hash": "5fbcee67de086e5b734244bfb40d1de6",
    "queryText": "ALTER VIEW tmp_view_zone_info AS SELECT z.zonename AS zone_name, c.cityname AS city_name FROM dict_zoneinfo z LEFT JOIN dict_cityinfo c ON z.cityid = c.cityid AND z.dt='20171109' AND c.dt='20171109' WHERE z.dt='20171109' AND c.dt='20171109'",
    "edges": [
        {
            "sources": [
                2
            ],
            "targets": [
                0
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                3
            ],
            "targets": [
                1
            ],
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                4,
                5
            ],
            "targets": [
                0,
                1
            ],
            "expression": "(z.cityid = c.cityid)",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                6
            ],
            "targets": [
                0,
                1
            ],
            "expression": "(c.dt = '20171109')",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                7,
                6
            ],
            "targets": [
                0,
                1
            ],
            "expression": "((z.dt = '20171109') and (c.dt = '20171109'))",
            "edgeType": "PREDICATE"
        }
    ],
    "vertices": [
        {
            "id": 0,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_view_zone_info.zone_name"
        },
        {
            "id": 1,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.tmp_view_zone_info.city_name"
        },
        {
            "id": 2,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.zonename"
        },
        {
            "id": 3,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityname"
        },
        {
            "id": 4,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.cityid"
        },
        {
            "id": 5,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.cityid"
        },
        {
            "id": 6,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_cityinfo.dt"
        },
        {
            "id": 7,
            "vertexType": "COLUMN",
            "vertexId": "cxy7_dw.dict_zoneinfo.dt"
        }
    ]
}


读后有收获可以支付宝请作者喝咖啡