From 9d892ae65f2c8691e4b28512c5a750a6bab9b559 Mon Sep 17 00:00:00 2001 From: antaldaniel Date: Sun, 18 Nov 2018 15:54:41 +0100 Subject: [PATCH 1/2] documenting lat, lon --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/geo_join.R | 7 +++- man/geo_join.Rd | 99 +++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 103 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b4080c8..593786e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,4 +34,4 @@ Suggests: maps, IRanges, covr -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index d635caa..9cdd734 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,4 +57,6 @@ export(stringdist_left_join) export(stringdist_right_join) export(stringdist_semi_join) importFrom(dplyr,"%>%") +importFrom(dplyr,common_by) +importFrom(dplyr,data_frame) importFrom(utils,data) diff --git a/R/geo_join.R b/R/geo_join.R index d3393b0..af54a78 100644 --- a/R/geo_join.R +++ b/R/geo_join.R @@ -8,7 +8,9 @@ #' #' @param x A tbl #' @param y A tbl -#' @param by Columns by which to join the two tables +#' @param by Columns by which to join the two tables. Must contain 'lon' +#' for longitude, and 'lat' for latitude, such as 'lat', or +#' 'lat.y', or 'lattitude'. #' @param max_dist Maximum distance to use for joining #' @param method Method to use for computing distance: one of #' "haversine" (default), "geo", "cosine", "meeus", "vincentysphere", @@ -29,6 +31,7 @@ #' \code{fuzzy_join}. 
#' #' @importFrom utils data +#' @importFrom dplyr common_by data_frame #' #' @examples #' @@ -74,7 +77,7 @@ geo_join <- function(x, y, by = NULL, max_dist, unit <- match.arg(unit) # make sure longitude and latitude are in the right order - by <- common_by(by, x, y) + by <- dplyr::common_by(by, x, y) by <- lapply(by, function(e) { if (length(e) != 2) { stop("Trying to join on ", paste(e, collapse = ", "), diff --git a/man/geo_join.Rd b/man/geo_join.Rd index 74681c3..7b06ed2 100644 --- a/man/geo_join.Rd +++ b/man/geo_join.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/geo_join.R +% Please edit documentation in R/geo_join.R, R/geo_join2.R \name{geo_join} \alias{geo_join} \alias{geo_inner_join} @@ -10,9 +10,31 @@ \alias{geo_anti_join} \title{Join two tables based on a geo distance of longitudes and latitudes} \usage{ -geo_join(x, y, by = NULL, max_dist, method = c("haversine", "geo", "cosine", - "meeus", "vincentysphere", "vincentyellipsoid"), unit = c("miles", "km"), - mode = "inner", distance_col = NULL, ...) +geo_join(x, y, by = NULL, max_dist, method = c("haversine", "geo", + "cosine", "meeus", "vincentysphere", "vincentyellipsoid"), + unit = c("miles", "km"), mode = "inner", distance_col = NULL, ...) + +geo_inner_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_left_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_right_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_full_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_semi_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_anti_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) 
+ +geo_join(x, y, by = NULL, max_dist, method = c("haversine", "geo", + "cosine", "meeus", "vincentysphere", "vincentyellipsoid"), + unit = c("miles", "km"), mode = "inner", distance_col = NULL, ...) geo_inner_join(x, y, by = NULL, method = "haversine", max_dist = 1, distance_col = NULL, ...) @@ -37,6 +59,29 @@ geo_anti_join(x, y, by = NULL, method = "haversine", max_dist = 1, \item{y}{A tbl} +\item{by}{Columns by which to join the two tables. Must contain 'lon' +for longitude, and 'lat' for latitude, such as 'lat', or +'lat.y', or 'lattitude'.} + +\item{max_dist}{Maximum distance to use for joining} + +\item{method}{Method to use for computing distance: one of +"haversine" (default), "geo", "cosine", "meeus", "vincentysphere", +"vincentyellipsoid"} + +\item{unit}{Unit of distance for threshold (default "miles")} + +\item{mode}{One of "inner", "left", "right", "full" "semi", or "anti"} + +\item{distance_col}{If given, will add a column with this +name containing the geographical distance between the two} + +\item{...}{Extra arguments passed on to the distance method} + +\item{x}{A tbl} + +\item{y}{A tbl} + \item{by}{Columns by which to join the two tables} \item{max_dist}{Maximum distance to use for joining} @@ -55,6 +100,12 @@ name containing the geographical distance between the two} \item{...}{Extra arguments passed on to the distance method} } \description{ +This allows joining based on combinations of longitudes and latitudes. If +you are using a distance metric that is *not* based on latitude and +longitude, use \code{\link{distance_join}} instead. Distances are +calculated based on the \code{distHaversine}, \code{distGeo}, +\code{distCosine}, etc methods in the geosphere package. + This allows joining based on combinations of longitudes and latitudes. If you are using a distance metric that is *not* based on latitude and longitude, use \code{\link{distance_join}} instead. Distances are @@ -67,12 +118,52 @@ approximately the fastest method. 
Note that by far the slowest method is vincentyellipsoid, and on fuzzy joins should only be used when there are very few pairs and accuracy is imperative. +If you need to use a custom geo method, you may want to write it directly +with the \code{multi_by} and \code{multi_match_fun} arguments to +\code{fuzzy_join}. + +"Haversine" was chosen as default since in some tests it is +approximately the fastest method. Note that by far the slowest method is +vincentyellipsoid, and on fuzzy joins should only be used when there are +very few pairs and accuracy is imperative. + If you need to use a custom geo method, you may want to write it directly with the \code{multi_by} and \code{multi_match_fun} arguments to \code{fuzzy_join}. } \examples{ +library(dplyr) +data("state") + +# find pairs of US states whose centers are within +# 200 miles of each other +states <- data_frame(state = state.name, + longitude = state.center$x, + latitude = state.center$y) + +s1 <- rename(states, state1 = state) +s2 <- rename(states, state2 = state) + +pairs <- s1 \%>\% + geo_inner_join(s2, max_dist = 200) \%>\% + filter(state1 != state2) + +pairs + +# plot them +library(ggplot2) +ggplot(pairs, aes(x = longitude.x, y = latitude.x, + xend = longitude.y, yend = latitude.y)) + + geom_segment(color = "red") + + borders("state") + + theme_void() + +# also get distances +s1 \%>\% + geo_inner_join(s2, max_dist = 200, distance_col = "distance") + + library(dplyr) data("state") From 51ded7bdebbc5f879d92916fd937c3fcee9d3069 Mon Sep 17 00:00:00 2001 From: antaldaniel Date: Sun, 18 Nov 2018 15:57:35 +0100 Subject: [PATCH 2/2] typo --- R/geo_join.R | 2 +- man/geo_join.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/geo_join.R b/R/geo_join.R index af54a78..a6833ec 100644 --- a/R/geo_join.R +++ b/R/geo_join.R @@ -10,7 +10,7 @@ #' @param y A tbl #' @param by Columns by which to join the two tables. 
Must contain 'lon' #' for longitude, and 'lat' for latitude, such as 'lat', or -#' 'lat.y', or 'lattitude'. +#' 'lat.y', or 'latitude'. #' @param max_dist Maximum distance to use for joining #' @param method Method to use for computing distance: one of #' "haversine" (default), "geo", "cosine", "meeus", "vincentysphere", diff --git a/man/geo_join.Rd b/man/geo_join.Rd index 7b06ed2..47e2f9a 100644 --- a/man/geo_join.Rd +++ b/man/geo_join.Rd @@ -61,7 +61,7 @@ geo_anti_join(x, y, by = NULL, method = "haversine", max_dist = 1, \item{by}{Columns by which to join the two tables. Must contain 'lon' for longitude, and 'lat' for latitude, such as 'lat', or -'lat.y', or 'lattitude'.} +'lat.y', or 'latitude'.} \item{max_dist}{Maximum distance to use for joining}