使用 R 语言下载和处理日度夜间灯光栅格数据（二）

学术 2024-10-17 18:01 云南

上次直播课没有讲完，欢迎各位培训班会员参加明晚 8 点的直播课：「使用 R 语言下载和处理日度夜间灯光栅格数据（二）」。

今天的课程中我们将一起学习如何下载和处理 NASA 的日度夜间灯光栅格数据。

日度夜间灯光的原始数据可以从这里下载：https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/。

关于该数据的介绍可以阅读这个文档：https://ladsweb.modaps.eosdis.nasa.gov/missions-and-measurements/products/VNP46A2/#overview。

该数据把全球根据经纬度划按照 10˚x10˚ 的网格进行了划分，每个分块又包含了 2400x2400 个格点，分辨率大致为 500mx500m，数据均为 HDF5 格式。因此处理该数据的第一步就是要先找到中国所处的区块文件。

上图来自：https://ladsweb.modaps.eosdis.nasa.gov/api/v2/content/archives/Document%20Archive/Science%20Data%20Product%20Documentation/Product%20Generation%20Algorithms/VIIRS_Black_Marble_ATBD_v1.1_July_2020.pdf

所以我们的下载和处理思路是：

获取所有的 HDF5 文件的链接；
筛选包含中国范围的文件；
下载；
处理 HDF5 文件得到 tiff 格式的栅格数据。

获取所有的 HDF5 文件的链接

这里我们可以通过网络数据爬取的方式获取。

首先爬取首页：

library(tidyverse)
library(rvest)
# https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/
# 爬取第一层：VNP46A2 
read_html("https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/") -> html

提取所有的子文件夹链接：

html %>% 
  html_elements("tr") %>% 
  html_attr("data-href") %>% 
  as_tibble() %>% 
  filter(str_detect(value, "VNP46A2")) -> fldf 

fldf

#> # A tibble: 13 × 1
#>    value                             
#>    <chr>                             
#>  1 /archive/allData/5000/VNP46A2/2012
#>  2 /archive/allData/5000/VNP46A2/2013
#>  3 /archive/allData/5000/VNP46A2/2014
#>  4 /archive/allData/5000/VNP46A2/2015
#>  5 /archive/allData/5000/VNP46A2/2016
#>  6 /archive/allData/5000/VNP46A2/2017
#>  7 /archive/allData/5000/VNP46A2/2018
#>  8 /archive/allData/5000/VNP46A2/2019
#>  9 /archive/allData/5000/VNP46A2/2020
#> 10 /archive/allData/5000/VNP46A2/2021
#> 11 /archive/allData/5000/VNP46A2/2022
#> 12 /archive/allData/5000/VNP46A2/2023
#> 13 /archive/allData/5000/VNP46A2/2024

创建文件夹保存下一轮结果：

dir.create("htmldir1")

fldf 中的链接都是相对链接，我们再生成完整的网址链接：

fldf %>% 
  mutate(value = paste0("https://ladsweb.modaps.eosdis.nasa.gov", value)) -> fldf2

下面需要爬取每个链接获取子文件夹的链接：

read_html(fldf2$value[1]) -> html2

html2 %>% 
  html_elements("tr") %>% 
  html_attr("data-href") %>% 
  as_tibble() %>% 
  slice(-1, -2)

#> # A tibble: 343 × 1
#>    value                                 
#>    <chr>                                 
#>  1 /archive/allData/5000/VNP46A2/2012/019
#>  2 /archive/allData/5000/VNP46A2/2012/020
#>  3 /archive/allData/5000/VNP46A2/2012/021
#>  4 /archive/allData/5000/VNP46A2/2012/022
#>  5 /archive/allData/5000/VNP46A2/2012/023
#>  6 /archive/allData/5000/VNP46A2/2012/024
#>  7 /archive/allData/5000/VNP46A2/2012/025
#>  8 /archive/allData/5000/VNP46A2/2012/026
#>  9 /archive/allData/5000/VNP46A2/2012/027
#> 10 /archive/allData/5000/VNP46A2/2012/028
#> # ℹ 333 more rows

然后就可以循环所有的了，为了更高效，我们可以采用多线程的方式：

dir.create("html2")
library(parallel)
makeCluster(16) -> cl
parLapply(cl, fldf2$value, function(x){
  if(!file.exists(paste0(stringr::str_remove(x, "https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/"), "/index.html"))) {
    download.file(x, paste0(stringr::str_remove(x, "https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/"), "/index.html")) 
  }
}) -> res

如果有下载失败的可以删除下载不完整的，然后再运行上面的代码，直到所有年份的 html 文件都下载成功。

然后就可以批量处理这些 html 文件提取下载链接了：

fs::dir_ls("htmldir1") %>% 
  as.character() %>% 
  as_tibble() -> fldf3

read_html(fldf3$value[1]) -> html3
html3 %>% 
  html_elements("tr") %>% 
  html_attr("data-href") %>% 
  as_tibble() %>% 
  slice(-1, -2)

#> # A tibble: 343 × 1
#>    value                                 
#>    <chr>                                 
#>  1 /archive/allData/5000/VNP46A2/2012/019
#>  2 /archive/allData/5000/VNP46A2/2012/020
#>  3 /archive/allData/5000/VNP46A2/2012/021
#>  4 /archive/allData/5000/VNP46A2/2012/022
#>  5 /archive/allData/5000/VNP46A2/2012/023
#>  6 /archive/allData/5000/VNP46A2/2012/024
#>  7 /archive/allData/5000/VNP46A2/2012/025
#>  8 /archive/allData/5000/VNP46A2/2012/026
#>  9 /archive/allData/5000/VNP46A2/2012/027
#> 10 /archive/allData/5000/VNP46A2/2012/028
#> # ℹ 333 more rows

# 循环所有的
fldf3 %>% 
  rename(a = value) %>% 
  mutate(data = map(a, function(x){
    read_html(x) %>% 
      html_elements("tr") %>% 
      html_attr("data-href") %>% 
      as_tibble() %>% 
      slice(-1, -2)
  })) %>% 
  unnest(data) %>% 
  select(-a) -> fldf4 

fldf4

#> # A tibble: 4,520 × 1
#>    value                                 
#>    <chr>                                 
#>  1 /archive/allData/5000/VNP46A2/2012/019
#>  2 /archive/allData/5000/VNP46A2/2012/020
#>  3 /archive/allData/5000/VNP46A2/2012/021
#>  4 /archive/allData/5000/VNP46A2/2012/022
#>  5 /archive/allData/5000/VNP46A2/2012/023
#>  6 /archive/allData/5000/VNP46A2/2012/024
#>  7 /archive/allData/5000/VNP46A2/2012/025
#>  8 /archive/allData/5000/VNP46A2/2012/026
#>  9 /archive/allData/5000/VNP46A2/2012/027
#> 10 /archive/allData/5000/VNP46A2/2012/028
#> # ℹ 4,510 more rows

循环所有的链接再爬取下一层的：

fldf4 %>% 
  mutate(value = paste0("https://ladsweb.modaps.eosdis.nasa.gov", value)) %>% 
  mutate(fid = row_number()) -> fldf4

dir.create("htmldir2")

# 这里选择 2023 年的为例进行下载：
fldf4 %>% 
  filter(str_detect(value, "2023")) -> fldf4_2023
clusterExport(cl, "fldf4_2023")
parLapply(cl, 1:nrow(fldf4_2023), function(x){
  if(!file.exists(paste0("htmldir2/", fldf4_2023$fid[x], ".html"))) {
    try({
      download.file(fldf4_2023$value[x], paste0("htmldir2/", fldf4_2023$fid[x], ".html"))
    })
  }
}) -> tempres

按照下载得到的文件大小排序，检查哪些是下载不完整的，删除之后再重新运行上面的代码，直到所有的文件都下载成功。

然后就可以处理得到的 html 文件获取所有的 HDF5 文件的链接了：

fs::dir_ls("htmldir2") %>% 
  as_tibble() %>% 
  rename(file = value) %>% 
  mutate(data = map(file, function(x){
    read_html(x) %>% 
      html_elements("tr") %>% 
      html_attr("data-path") %>% 
      as_tibble() %>% 
      slice(-1, -2) 
  })) -> fldf5

fldf5

#> # A tibble: 730 × 2
#>    file               data              
#>    <fs::path>         <list>            
#>  1 htmldir2/1059.html <tibble [346 × 1]>
#>  2 htmldir2/1060.html <tibble [345 × 1]>
#>  3 htmldir2/1061.html <tibble [346 × 1]>
#>  4 htmldir2/1062.html <tibble [346 × 1]>
#>  5 htmldir2/1063.html <tibble [346 × 1]>
#>  6 htmldir2/1064.html <tibble [346 × 1]>
#>  7 htmldir2/1065.html <tibble [346 × 1]>
#>  8 htmldir2/1066.html <tibble [346 × 1]>
#>  9 htmldir2/1067.html <tibble [346 × 1]>
#> 10 htmldir2/1068.html <tibble [346 × 1]>
#> # ℹ 720 more rows

fldf5 %>% 
  unnest(data) %>% 
  select(-file) %>% 
  mutate(value = paste0("https://ladsweb.modaps.eosdis.nasa.gov", value)) -> fldf6 

fldf6$value[1]

#> [1] "https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001/VNP46A2.A2023001.h00v01.001.2023012002938.h5"

下面就可以下载这些文件了，不过这些链接并不能使用 download.file() 直接下载，而是需要使用 NASA 规定的方法。

先保存下：

fldf6 %>% 
  write_rds("fldf6.rds")

筛选含中国范围的文件

正如开头所说，该数据是把全球的数据划分成了一个个小格子，下面我们需要找到与中国范围相交的。

library(tidyverse)
library(terra)
rast(nrows = 18, ncols = 36, xmin=-180, xmax=180, 
     ymin=-90, ymax=90, crs = "+proj=longlat +datum=WGS84 +no_defs") -> rst
rst[] <- 1:648
rst

#> class       : SpatRaster 
#> dimensions  : 18, 36, 1  (nrow, ncol, nlyr)
#> resolution  : 10, 10  (x, y)
#> extent      : -180, 180, -90, 90  (xmin, xmax, ymin, ymax)
#> coord. ref. : +proj=longlat +datum=WGS84 +no_defs 
#> source(s)   : memory
#> name        : lyr.1 
#> min value   :     1 
#> max value   :   648

plot(rst)

一共是 648 个格子，我给每个格子都进行了编号 1～648。

合并省级数据得到中国的范围：

library(sf)
read_sf("2021行政区划/省.shp") -> prov 
st_union(prov) %>% 
  nngeo::st_remove_holes() -> cn 
plot(cn)

把 rst 转换成 polygon 对象：

library(raster)
rst %>% 
  raster() %>% 
  rasterToPolygons() %>% 
  st_as_sf() -> rstdf

rstdf

#> Simple feature collection with 648 features and 1 field
#> Geometry type: POLYGON
#> Dimension:     XY
#> Bounding box:  xmin: -180 ymin: -90 xmax: 180 ymax: 90
#> Geodetic CRS:  +proj=longlat +datum=WGS84 +no_defs
#> First 10 features:
#>    lyr.1                       geometry
#> 1      1 POLYGON ((-180 90, -170 90,...
#> 2      2 POLYGON ((-170 90, -160 90,...
#> 3      3 POLYGON ((-160 90, -150 90,...
#> 4      4 POLYGON ((-150 90, -140 90,...
#> 5      5 POLYGON ((-140 90, -130 90,...
#> 6      6 POLYGON ((-130 90, -120 90,...
#> 7      7 POLYGON ((-120 90, -110 90,...
#> 8      8 POLYGON ((-110 90, -100 90,...
#> 9      9 POLYGON ((-100 90, -90 90, ...
#> 10    10 POLYGON ((-90 90, -80 90, -...

提取和中国相交的：

rstdf %>% 
  st_intersection(cn) -> res

这些编号对应的格子组成的范围正好涵盖了中国的区域：

res %>% 
  dplyr::select(value = lyr.1) -> ressf
ressf

#> Simple feature collection with 24 features and 1 field
#> Geometry type: GEOMETRY
#> Dimension:     XY
#> Bounding box:  xmin: 73.50235 ymin: 3.83703 xmax: 135.0957 ymax: 53.56362
#> Geodetic CRS:  +proj=longlat +datum=WGS84 +no_defs
#> First 10 features:
#>     value                       geometry
#> 138   138 POLYGON ((119.1566 50.03325...
#> 139   139 POLYGON ((120 51.51333, 120...
#> 170   170 MULTIPOLYGON (((74.38025 40...
#> 171   171 POLYGON ((80 42.04188, 80 4...
#> 172   172 POLYGON ((90 47.88057, 90 4...
#> 173   173 POLYGON ((100 42.64699, 100...
#> 174   174 POLYGON ((110 42.63796, 110...
#> 175   175 MULTIPOLYGON (((127.5189 50...
#> 176   176 POLYGON ((130 49.02151, 130...
#> 206   206 POLYGON ((80 30.67492, 80 4...

我们注意到每个 h5 文件的文字里面都有类似 h00v01 的编号：

fldf6$value[1]

#> [1] "https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001/VNP46A2.A2023001.h00v01.001.2023012002938.h5"

下面我们还需要找到 ressf 数据中的编号与 hv 的对应关系：

0:17 %>% 
  crossing(0:35) %>% 
  set_names("v", "h") %>% 
  arrange(v, h) %>% 
  mutate_all(~sprintf("%02d", .x)) %>% 
  mutate(names = paste0("h", h, "v", v)) %>% 
  mutate(value = 1:648) -> tabdf

tabdf

#> # A tibble: 648 × 4
#>    v     h     names  value
#>    <chr> <chr> <chr>  <int>
#>  1 00    00    h00v00     1
#>  2 00    01    h01v00     2
#>  3 00    02    h02v00     3
#>  4 00    03    h03v00     4
#>  5 00    04    h04v00     5
#>  6 00    05    h05v00     6
#>  7 00    06    h06v00     7
#>  8 00    07    h07v00     8
#>  9 00    08    h08v00     9
#> 10 00    09    h09v00    10
#> # ℹ 638 more rows

所以筛选这些格子的正则表达式就是这个：

ressf %>% 
  inner_join(tabdf) %>% 
  pull(names) %>% 
  paste0(collapse = "|") -> regexp

regexp

#> [1] "h29v03|h30v03|h25v04|h26v04|h27v04|h28v04|h29v04|h30v04|h31v04|h25v05|h26v05|h27v05|h28v05|h29v05|h30v05|h26v06|h27v06|h28v06|h29v06|h30v06|h28v07|h29v07|h28v08|h29v08"

然后就可以根据这个正则表达式筛选中国的格子了：

fldf6 %>% 
  filter(str_detect(value, regexp)) -> cnfldf6 

cnfldf6

#> # A tibble: 17,322 × 1
#>    value                                                                        
#>    <chr>                                                                        
#>  1 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  2 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  3 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  4 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  5 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  6 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  7 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  8 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#>  9 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#> 10 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46A2/2023/001…
#> # ℹ 17,312 more rows

由于时间关系，我们只下载前 5 天的数据：

# 提取日期
cnfldf6 %>% 
  mutate(date = str_extract(value, "A\\d{7}")) %>% 
  mutate(date = str_remove_all(date, "A")) %>% 
  mutate(year = as.numeric(str_sub(date, 1, 4)),
         day = as.numeric(str_sub(date, 5, 7)),
         newdate = ymd(paste0(year, "-01-01")) + day - 1) %>% 
  dplyr::select(-date, -year, -day) -> cnfldf62

# 选择前 5 天的
cnfldf62 %>% 
  filter(newdate <= ymd("2023-01-05")) -> cnfldf7

cnfldf7

#> # A tibble: 240 × 2
#>    value                                                              newdate   
#>    <chr>                                                              <date>    
#>  1 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  2 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  3 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  4 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  5 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  6 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  7 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  8 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#>  9 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#> 10 https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5000/VNP46… 2023-01-01
#> # ℹ 230 more rows

下载 HDF5 文件

下面就可以逐一下载这些文件了。

NASA 推荐的下载方法是使用 wget，Mac 系统自带这个命令，不过 Windows 电脑需要安装，安装方法可以参考：https://ladsweb.modaps.eosdis.nasa.gov/tools-and-services/data-download-scripts/#wget ，另外也可以百度一下 wget 的使用。这个命令在 Mac 上是在“终端” App 上运行的，在 Windows 上是在命令行工具（按 win+R 输入 cmd 回车就可以打开了）里面运行的。

另外使用 httr 包下载也是可以的，试试一个：

library(httr)
GET(cnfldf7$value[1], 
    add_headers(c(
      Authorization = "Bearer eyJ0eXAiOiJKV1QiLCJvcmlnaW4iOiJFYXJ0aGRhdGEgTG9naW4iLCJzaWciOiJlZGxqd3RwdWJrZXlfb3BzIiwiYWxnIjoiUlMyNTYifQ.eyJ0eXBlIjoiVXNlciIsInVpZCI6ImN6eGEiLCJleHAiOjE3MjI3NTAzNjAsImlhdCI6MTcxNzU2NjM2MCwiaXNzIjoiRWFydGhkYXRhIExvZ2luIn0.nJalDlQx31wHllEyIBsFMMjLhRA8tCEV9zaSC3q5K4JrY5Gem5m622NDRqQoF_R7w1KksN4cLmVJroilLH0zKeLRUX4vHS97jy008zRNuNoBcAhwXBeXue6scUsWsAfYre3Y-u61HNswQy0w6emOa5IbkG11Kc6umbHXh9XOHj2R5eOcXeSkiAHKmFv4LZSQXAkbJV_UIzcE-WLzxI3gyQnNrl_DVyfZonQM2FkZlSpc7Gt--Tn7pL03kKgb_Qz9KGhLxt7RfHqt0icgc4lnHjIaRW1xXS9G490xPyqlKWwuc7tfYL0Ymq7Kky7KP_xft9Jn-wWbuR5bvpFIoafeuw"
    ))) -> getres

其中 eyJ...afeuw 是我的 token，大家可以从这里生成和查看自己的 token：https://urs.earthdata.nasa.gov/profile （Generate Token 页面）

保存为 h5 文件：

content(getres) %>% 
  writeBin("temp.h5")

然后就可以循环下载所有的 h5 文件了：

dir.create("h5data")
parLapply(cl, cnfldf7$value, function(x){
  if (!file.exists(paste0("h5data/", basename(x)))) {
    httr::GET(x, 
              httr::add_headers(c(
                Authorization = "Bearer eyJ0eXAiOiJKV1QiLCJvcmlnaW4iOiJFYXJ0aGRhdGEgTG9naW4iLCJzaWciOiJlZGxqd3RwdWJrZXlfb3BzIiwiYWxnIjoiUlMyNTYifQ.eyJ0eXBlIjoiVXNlciIsInVpZCI6ImN6eGEiLCJleHAiOjE3MjI3NTAzNjAsImlhdCI6MTcxNzU2NjM2MCwiaXNzIjoiRWFydGhkYXRhIExvZ2luIn0.nJalDlQx31wHllEyIBsFMMjLhRA8tCEV9zaSC3q5K4JrY5Gem5m622NDRqQoF_R7w1KksN4cLmVJroilLH0zKeLRUX4vHS97jy008zRNuNoBcAhwXBeXue6scUsWsAfYre3Y-u61HNswQy0w6emOa5IbkG11Kc6umbHXh9XOHj2R5eOcXeSkiAHKmFv4LZSQXAkbJV_UIzcE-WLzxI3gyQnNrl_DVyfZonQM2FkZlSpc7Gt--Tn7pL03kKgb_Qz9KGhLxt7RfHqt0icgc4lnHjIaRW1xXS9G490xPyqlKWwuc7tfYL0Ymq7Kky7KP_xft9Jn-wWbuR5bvpFIoafeuw"
              ))) -> getres 
    writeBin(content(getres), paste0("h5data/", basename(x)))
  }
}) -> tempres

处理 h5 文件得到栅格数据

h5 文件可以使用 rhdf5 包处理，该包的安装方法：

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("rhdf5")

读取：

library(rhdf5)
library(terra)
# 查看所有的层
rhdf5::h5ls("temp.h5") %>% 
  as_tibble()

#> # A tibble: 15 × 5
#>    group                                  name                otype dclass dim  
#>    <chr>                                  <chr>               <chr> <chr>  <chr>
#>  1 /                                      HDFEOS              H5I_… ""     ""   
#>  2 /HDFEOS                                ADDITIONAL          H5I_… ""     ""   
#>  3 /HDFEOS/ADDITIONAL                     FILE_ATTRIBUTES     H5I_… ""     ""   
#>  4 /HDFEOS                                GRIDS               H5I_… ""     ""   
#>  5 /HDFEOS/GRIDS                          VNP_Grid_DNB        H5I_… ""     ""   
#>  6 /HDFEOS/GRIDS/VNP_Grid_DNB             Data Fields         H5I_… ""     ""   
#>  7 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields DNB_BRDF-Corrected… H5I_… "INTE… "240…
#>  8 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields DNB_Lunar_Irradian… H5I_… "INTE… "240…
#>  9 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields Gap_Filled_DNB_BRD… H5I_… "INTE… "240…
#> 10 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields Latest_High_Qualit… H5I_… "INTE… "240…
#> 11 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields Mandatory_Quality_… H5I_… "INTE… "240…
#> 12 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields QF_Cloud_Mask       H5I_… "INTE… "240…
#> 13 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields Snow_Flag           H5I_… "INTE… "240…
#> 14 /                                      HDFEOS INFORMATION  H5I_… ""     ""   
#> 15 /HDFEOS INFORMATION                    StructMetadata.0    H5I_… "STRI… "( 0…

我们主要需要的是 Gap-Filled DNB BRDF-Corrected NTL 层，该层的名字是 /HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields/Gap_Filled_DNB_BRDF-Corrected_NTL (group 也要带上)

# 读取该层的数据
rhdf5::h5read("temp.h5", name = "/HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields/Gap_Filled_DNB_BRDF-Corrected_NTL") -> mat

# 得到的是个矩阵下面我们再把它转换成栅格数据
dim(mat)

#> [1] 2400 2400

矩阵数据转换成栅格数据需要设定坐标系和范围。所有的数据坐标系都是 +proj=longlat +datum=WGS84 +no_defs，但是范围却是不同的。下面我们再生成各个格子的范围数据：

# 计算每个格子的 bounding box：
rstdf %>% 
  mutate(value = 1:648) %>% 
  inner_join(ressf %>% st_drop_geometry()) %>% 
  inner_join(tabdf) %>% 
  dplyr::select(names) %>% 
  mutate(bbox = map(geometry, ~st_bbox(.x))) %>% 
  st_drop_geometry() %>% 
  as_tibble() -> bboxdf

bboxdf

#> # A tibble: 24 × 2
#>    names  bbox      
#>    <chr>  <list>    
#>  1 h29v03 <bbox [4]>
#>  2 h30v03 <bbox [4]>
#>  3 h25v04 <bbox [4]>
#>  4 h26v04 <bbox [4]>
#>  5 h27v04 <bbox [4]>
#>  6 h28v04 <bbox [4]>
#>  7 h29v04 <bbox [4]>
#>  8 h30v04 <bbox [4]>
#>  9 h31v04 <bbox [4]>
#> 10 h25v05 <bbox [4]>
#> # ℹ 14 more rows

# 保存
bboxdf %>% 
  write_rds("bboxdf.rds")

这样我们就可以根据 hv 编号给不同的栅格数据设定不同的范围了。

处理 h5 文件得到栅格数据

首先索引所有的 h5 文件并生成日期：

fs::dir_ls("h5data") %>% 
  as.character() %>% 
  as_tibble() %>% 
  mutate(date = str_extract(value, "A\\d{7}")) %>% 
  mutate(date = str_remove_all(date, "A")) %>% 
  mutate(year = as.numeric(str_sub(date, 1, 4)),
         day = as.numeric(str_sub(date, 5, 7)),
         newdate = ymd(paste0(year, "-01-01")) + day - 1) %>% 
  dplyr::select(-date, -year, -day) -> h5fl

例如 2023 年 1 月 1 号的：

h5fl %>% 
  filter(newdate == ymd("2023-01-01")) %>% 
  pull(value) -> files

读取为栅格数据：

lapply(files, function(x){
  rhdf5::h5read(x, name = "/HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields/Gap_Filled_DNB_BRDF-Corrected_NTL") -> mat
  mat[mat > 65534] <- NA
  terra::rast(mat) %>% 
    trans() %>% 
    `crs<-`("+proj=longlat +datum=WGS84 +no_defs") 
}) -> rstlist

这里出现了 65534 这个数字。从该数据的介绍文档的第 50 页可以看到：https://ladsweb.modaps.eosdis.nasa.gov/api/v2/content/archives/Document%20Archive/Science%20Data%20Product%20Documentation/VIIRS_Black_Marble_UG_v1.3_Sep_2022.pdf，该夜间灯光数据有有效值是 0-65534，所以超过 65534 的被设定为了 NA。

提取 hv 编号并和 bboxdf 合并：

str_extract(files, "h\\d{2}v\\d{2}") %>% 
  as_tibble() %>% 
  set_names("names") %>% 
  left_join(bboxdf) %>% 
  pull(bbox) -> bboxlist

然后循环给 rstlist 里栅格数据设定正确的范围：

lapply(1:length(files), function(x){
  c(bboxlist[[x]]) -> bboxtmp
  rstlist[[x]] %>% 
    `ext<-`(c(bboxtmp["xmin"], bboxtmp["xmax"],
              bboxtmp["ymin"], bboxtmp["ymax"]))
}) %>% 
  sprc() %>% 
  mosaic() -> rstall

rstall

#> class       : SpatRaster 
#> dimensions  : 14400, 16800, 1  (nrow, ncol, nlyr)
#> resolution  : 0.004166667, 0.004166667  (x, y)
#> extent      : 70, 140, 0, 60  (xmin, xmax, ymin, ymax)
#> coord. ref. : +proj=longlat +datum=WGS84 +no_defs 
#> source(s)   : memory
#> name        : lyr.1 
#> min value   :     0 
#> max value   : 60644

sprc() 和 mosaic() 的组合可以列表里面存储的栅格数据们合并成一个大栅格数据（类似拼图），这种操作被称为“镶嵌”。

rstall 就是合并好的夜间灯光栅格数据了：

plot(log(rstall))

这样我们就处理好一天的数据了。循环所有天的数据即可：

dir.create("nightlight")

unique(h5fl$newdate) -> datelist 
clusterExport(cl, "h5fl")
clusterExport(cl, "bboxdf")
clusterEvalQ(cl, ({
  library(tidyverse)
  library(terra)
})) -> tempres 
parLapply(cl, datelist, function(x){
  if (!file.exists(paste0("nightlight/", x, ".tif"))){
    h5fl %>% 
      filter(newdate == ymd(x)) %>% 
      pull(value) -> files 
    
    # 读取为栅格数据
    lapply(files, function(x){
      rhdf5::h5read(x, name = "/HDFEOS/GRIDS/VNP_Grid_DNB/Data Fields/Gap_Filled_DNB_BRDF-Corrected_NTL") -> mat
      mat[mat>65534] <- NA
      terra::rast(mat) %>% 
        trans() %>% 
        `crs<-`("+proj=longlat +datum=WGS84 +no_defs") 
    }) -> rstlist
    
    # 提取 hv 编号并和 bboxdf 合并
    str_extract(files, "h\\d{2}v\\d{2}") %>% 
      as_tibble() %>% 
      set_names("names") %>% 
      left_join(bboxdf) %>% 
      pull(bbox) -> bboxlist
    
    lapply(1:length(files), function(x){
      c(bboxlist[[x]]) -> bboxtmp
      rstlist[[x]] %>% 
        `ext<-`(c(bboxtmp["xmin"], bboxtmp["xmax"],
                  bboxtmp["ymin"], bboxtmp["ymax"]))
    }) %>% 
      sprc() %>% 
      mosaic() %>% 
      `names<-`(x) %>% 
      writeRaster(paste0("nightlight/", x, ".tif"))
  }
}) -> res

再读取进来检查下：

rast("nightlight/2023-01-01.tif") -> rst
rst

#> class       : SpatRaster 
#> dimensions  : 14400, 16800, 1  (nrow, ncol, nlyr)
#> resolution  : 0.004166667, 0.004166667  (x, y)
#> extent      : 70, 140, 0, 60  (xmin, xmax, ymin, ymax)
#> coord. ref. : lon/lat WGS 84 (EPSG:4326) 
#> source      : 2023-01-01.tif 
#> name        : 2023-01-01 
#> min value   :          0 
#> max value   :      60644

如果只想要中国范围的，可以再用前面的 cn 裁剪下：

rst %>% 
  terra::crop(terra::vect(cn)) %>% 
  terra::mask(terra::vect(cn)) -> rst2

至于计算各省市区县的平均夜间灯光亮度这里就不再讲解了，感兴趣的小伙伴可以学习平台上的一些栅格转面板课程。

直播信息

为了让大家更好的理解本文内容，欢迎各位培训班会员参加明晚 8 点的直播课：「使用 R 语言下载和处理日度夜间灯光栅格数据」。

直播地址：腾讯会议(需要报名地理数据科学培训班参加)
需要报名地理数据科学培训班，详细阅读：一网打尽SCI论文数据获取、分析、可视化

更多关于地理数据科学培训班会员的更多信息可添加微信咨询：

附件下载（点击文末的阅读原文即可跳转）：
https://geodatasci.duanshu.com/#/course/6e679ebb823044548e80013367bfc133

http://mp.weixin.qq.com/s?__biz=MzI4OTU3NTY1OA==&mid=2247508859&idx=1&sn=c8c507450c3785b1809fd20c0af965ad

走天涯徐小洋地理数据科学

一个爱生活的地理土博，分享GIS、遥感、空间分析、R语言、景观生态等地理数据科学实操教程、经典文献、数据资源