You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
256 lines
9.2 KiB
256 lines
9.2 KiB
""" Google BigQuery support """
|
|
from __future__ import annotations
|
|
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Any,
|
|
)
|
|
import warnings
|
|
|
|
from pandas.compat._optional import import_optional_dependency
|
|
from pandas.util._exceptions import find_stack_level
|
|
|
|
if TYPE_CHECKING:
|
|
import google.auth
|
|
|
|
from pandas import DataFrame
|
|
|
|
|
|
def _try_import():
|
|
# since pandas is a dependency of pandas-gbq
|
|
# we need to import on first use
|
|
msg = (
|
|
"pandas-gbq is required to load data from Google BigQuery. "
|
|
"See the docs: https://pandas-gbq.readthedocs.io."
|
|
)
|
|
pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
|
|
return pandas_gbq
|
|
|
|
|
|
def read_gbq(
|
|
query: str,
|
|
project_id: str | None = None,
|
|
index_col: str | None = None,
|
|
col_order: list[str] | None = None,
|
|
reauth: bool = False,
|
|
auth_local_webserver: bool = True,
|
|
dialect: str | None = None,
|
|
location: str | None = None,
|
|
configuration: dict[str, Any] | None = None,
|
|
credentials: google.auth.credentials.Credentials | None = None,
|
|
use_bqstorage_api: bool | None = None,
|
|
max_results: int | None = None,
|
|
progress_bar_type: str | None = None,
|
|
) -> DataFrame:
|
|
"""
|
|
Load data from Google BigQuery.
|
|
|
|
.. deprecated:: 2.2.0
|
|
|
|
Please use ``pandas_gbq.read_gbq`` instead.
|
|
|
|
This function requires the `pandas-gbq package
|
|
<https://pandas-gbq.readthedocs.io>`__.
|
|
|
|
See the `How to authenticate with Google BigQuery
|
|
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
|
|
guide for authentication instructions.
|
|
|
|
Parameters
|
|
----------
|
|
query : str
|
|
SQL-Like Query to return data values.
|
|
project_id : str, optional
|
|
Google BigQuery Account project ID. Optional when available from
|
|
the environment.
|
|
index_col : str, optional
|
|
Name of result column to use for index in results DataFrame.
|
|
col_order : list(str), optional
|
|
List of BigQuery column names in the desired order for results
|
|
DataFrame.
|
|
reauth : bool, default False
|
|
Force Google BigQuery to re-authenticate the user. This is useful
|
|
if multiple accounts are used.
|
|
auth_local_webserver : bool, default True
|
|
Use the `local webserver flow`_ instead of the `console flow`_
|
|
when getting user credentials.
|
|
|
|
.. _local webserver flow:
|
|
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
|
|
.. _console flow:
|
|
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
|
|
|
|
*New in version 0.2.0 of pandas-gbq*.
|
|
|
|
.. versionchanged:: 1.5.0
|
|
Default value is changed to ``True``. Google has deprecated the
|
|
``auth_local_webserver = False`` `"out of band" (copy-paste)
|
|
flow
|
|
<https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
|
|
dialect : str, default 'legacy'
|
|
Note: The default value is changing to 'standard' in a future version.
|
|
|
|
SQL syntax dialect to use. Value can be one of:
|
|
|
|
``'legacy'``
|
|
Use BigQuery's legacy SQL dialect. For more information see
|
|
`BigQuery Legacy SQL Reference
|
|
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
|
|
``'standard'``
|
|
Use BigQuery's standard SQL, which is
|
|
compliant with the SQL 2011 standard. For more information
|
|
see `BigQuery Standard SQL Reference
|
|
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
|
|
location : str, optional
|
|
Location where the query job should run. See the `BigQuery locations
|
|
documentation
|
|
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
|
|
list of available locations. The location must match that of any
|
|
datasets used in the query.
|
|
|
|
*New in version 0.5.0 of pandas-gbq*.
|
|
configuration : dict, optional
|
|
Query config parameters for job processing.
|
|
For example:
|
|
|
|
configuration = {'query': {'useQueryCache': False}}
|
|
|
|
For more information see `BigQuery REST API Reference
|
|
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
|
|
credentials : google.auth.credentials.Credentials, optional
|
|
Credentials for accessing Google APIs. Use this parameter to override
|
|
default credentials, such as to use Compute Engine
|
|
:class:`google.auth.compute_engine.Credentials` or Service Account
|
|
:class:`google.oauth2.service_account.Credentials` directly.
|
|
|
|
*New in version 0.8.0 of pandas-gbq*.
|
|
use_bqstorage_api : bool, default False
|
|
Use the `BigQuery Storage API
|
|
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
|
|
download query results quickly, but at an increased cost. To use this
|
|
API, first `enable it in the Cloud Console
|
|
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
|
|
You must also have the `bigquery.readsessions.create
|
|
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
|
|
permission on the project you are billing queries to.
|
|
|
|
This feature requires version 0.10.0 or later of the ``pandas-gbq``
|
|
package. It also requires the ``google-cloud-bigquery-storage`` and
|
|
``fastavro`` packages.
|
|
|
|
max_results : int, optional
|
|
If set, limit the maximum number of rows to fetch from the query
|
|
results.
|
|
|
|
progress_bar_type : Optional, str
|
|
If set, use the `tqdm <https://tqdm.github.io/>`__ library to
|
|
display a progress bar while the data downloads. Install the
|
|
``tqdm`` package to use this feature.
|
|
|
|
Possible values of ``progress_bar_type`` include:
|
|
|
|
``None``
|
|
No progress bar.
|
|
``'tqdm'``
|
|
Use the :func:`tqdm.tqdm` function to print a progress bar
|
|
to :data:`sys.stderr`.
|
|
``'tqdm_notebook'``
|
|
Use the :func:`tqdm.tqdm_notebook` function to display a
|
|
progress bar as a Jupyter notebook widget.
|
|
``'tqdm_gui'``
|
|
Use the :func:`tqdm.tqdm_gui` function to display a
|
|
progress bar as a graphical dialog box.
|
|
|
|
Returns
|
|
-------
|
|
df: DataFrame
|
|
DataFrame representing results of query.
|
|
|
|
See Also
|
|
--------
|
|
pandas_gbq.read_gbq : This function in the pandas-gbq library.
|
|
DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
|
|
|
|
Examples
|
|
--------
|
|
Example taken from `Google BigQuery documentation
|
|
<https://cloud.google.com/bigquery/docs/pandas-gbq-migration>`_
|
|
|
|
>>> sql = "SELECT name FROM table_name WHERE state = 'TX' LIMIT 100;"
|
|
>>> df = pd.read_gbq(sql, dialect="standard") # doctest: +SKIP
|
|
>>> project_id = "your-project-id" # doctest: +SKIP
|
|
>>> df = pd.read_gbq(sql,
|
|
... project_id=project_id,
|
|
... dialect="standard"
|
|
... ) # doctest: +SKIP
|
|
"""
|
|
warnings.warn(
|
|
"read_gbq is deprecated and will be removed in a future version. "
|
|
"Please use pandas_gbq.read_gbq instead: "
|
|
"https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.read_gbq",
|
|
FutureWarning,
|
|
stacklevel=find_stack_level(),
|
|
)
|
|
pandas_gbq = _try_import()
|
|
|
|
kwargs: dict[str, str | bool | int | None] = {}
|
|
|
|
# START: new kwargs. Don't populate unless explicitly set.
|
|
if use_bqstorage_api is not None:
|
|
kwargs["use_bqstorage_api"] = use_bqstorage_api
|
|
if max_results is not None:
|
|
kwargs["max_results"] = max_results
|
|
|
|
kwargs["progress_bar_type"] = progress_bar_type
|
|
# END: new kwargs
|
|
|
|
return pandas_gbq.read_gbq(
|
|
query,
|
|
project_id=project_id,
|
|
index_col=index_col,
|
|
col_order=col_order,
|
|
reauth=reauth,
|
|
auth_local_webserver=auth_local_webserver,
|
|
dialect=dialect,
|
|
location=location,
|
|
configuration=configuration,
|
|
credentials=credentials,
|
|
**kwargs,
|
|
)
|
|
|
|
|
|
def to_gbq(
|
|
dataframe: DataFrame,
|
|
destination_table: str,
|
|
project_id: str | None = None,
|
|
chunksize: int | None = None,
|
|
reauth: bool = False,
|
|
if_exists: str = "fail",
|
|
auth_local_webserver: bool = True,
|
|
table_schema: list[dict[str, str]] | None = None,
|
|
location: str | None = None,
|
|
progress_bar: bool = True,
|
|
credentials: google.auth.credentials.Credentials | None = None,
|
|
) -> None:
|
|
warnings.warn(
|
|
"to_gbq is deprecated and will be removed in a future version. "
|
|
"Please use pandas_gbq.to_gbq instead: "
|
|
"https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.to_gbq",
|
|
FutureWarning,
|
|
stacklevel=find_stack_level(),
|
|
)
|
|
pandas_gbq = _try_import()
|
|
pandas_gbq.to_gbq(
|
|
dataframe,
|
|
destination_table,
|
|
project_id=project_id,
|
|
chunksize=chunksize,
|
|
reauth=reauth,
|
|
if_exists=if_exists,
|
|
auth_local_webserver=auth_local_webserver,
|
|
table_schema=table_schema,
|
|
location=location,
|
|
progress_bar=progress_bar,
|
|
credentials=credentials,
|
|
)
|