Port all initial vignettes

rzats · rzats · commit 99fee9802f61 · 2024-08-27T18:43:11.000+03:00
diff --git a/docs/index.rst b/docs/index.rst
@@ -76,3 +76,6 @@ Contents
 
    getting_started_with_epidatpy
 
+   signal_discovery
+
+   versioned_data
diff --git a/docs/signal_discovery.rst b/docs/signal_discovery.rst
@@ -0,0 +1,83 @@
+
+Finding data sources and signals of interest
+============================================
+
+The Epidata API includes numerous data streams -- medical claims data, cases and deaths,
+mobility, and many others -- covering different geographic regions. This can make it a
+challenge to find the data stream that you are most interested in.
+
+Example queries with all the endpoint functions available in this package are
+given below.
+
+
+Using the documentation
+-----------------------
+
+The Epidata documentation lists all the data sources and signals available
+through the API for
+`COVID-19 <https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html>`_ and
+for `other diseases <https://cmu-delphi.github.io/delphi-epidata/api/README.html#source-specific-parameters>`_.
+The site also includes a search tool if you have a keyword (e.g. "Taiwan") in mind.
+
+
+Signal metadata
+---------------
+
+The ``source_df`` property lets us obtain a Pandas DataFrame of metadata describing all
+data streams which are publicly accessible from the COVIDcast API. See the `data source
+and signals documentation <https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html>`_
+for descriptions of the available sources.
+
+>>> from epidatpy import CovidcastEpidata
+>>> epidata = CovidcastEpidata()
+>>> sources = epidata.source_df
+>>> sources.head()
+            source                                         name                                        description          reference_signal                                            license                                                dua                                            signals
+0             chng                            Change Healthcare  Change Healthcare is a healthcare technology c...   smoothed_outpatient_cli                                           CC BY-NC  https://cmu.box.com/s/cto4to822zecr3oyq1kkk9xm...  smoothed_outpatient_cli,smoothed_adj_outpatien...
+1    covid-act-now                          Covid Act Now (CAN)  COVID Act Now (CAN) tracks COVID-19 testing st...  pcr_specimen_total_tests                                           CC BY-NC                                               None  pcr_specimen_positivity_rate,pcr_specimen_tota...
+2    doctor-visits                    Doctor Visits From Claims  Information about outpatient visits, provided ...              smoothed_cli                                              CC BY  https://cmu.box.com/s/l2tz6kmiws6jyty2azwb43po...                      smoothed_cli,smoothed_adj_cli
+3        fb-survey  Delphi US COVID-19 Trends and Impact Survey  We conduct the Delphi US COVID-19 Trends and I...              smoothed_cli                                              CC BY  https://cmu.box.com/s/qfxplcdrcn9retfzx4zniyug...  raw_wcli,raw_cli,smoothed_cli,smoothed_wcli,ra...
+4  google-symptoms                Google Symptoms Search Trends  Google's [COVID-19 Search Trends symptoms data...       s05_smoothed_search  To download or use the data, you must agree to...                                               None  ageusia_raw_search,ageusia_smoothed_search,ano...
+
+This DataFrame contains the following columns:
+
+- ``source`` - Data source name.
+- ``name`` - Human-readable name of the data source.
+- ``description`` - Description of the data source.
+- ``reference_signal`` - Name of the signal used as the reference signal for the data source (e.g. ``smoothed_cli`` for ``doctor-visits``).
+- ``license`` - License or terms of use under which the data is made available.
+- ``dua`` - Link to the Data Use Agreement.
+
+The ``signal_df`` DataFrame can also be used to obtain information about the signals
+that are available - for example, what time range they are available for,
+and when they have been updated.
+
+>>> signals = epidata.signal_df
+>>> signals.head()
+  source                         signal                                          name  active                                  short_description                                        description time_type time_label value_label format category high_values_are  is_smoothed is_weighted is_cumulative has_stderr has_sample_size                        geo_types
+0   chng        smoothed_outpatient_cli                   COVID-Related Doctor Visits   False  Estimated percentage of outpatient doctor visi...  Estimated percentage of outpatient doctor visi...       day       Date       Value    raw    early             bad         True       False         False      False           False  county,hhs,hrr,msa,nation,state
+1   chng    smoothed_adj_outpatient_cli    COVID-Related Doctor Visits (Day-adjusted)   False  Estimated percentage of outpatient doctor visi...  Estimated percentage of outpatient doctor visi...       day       Date       Value    raw    early             bad         True       False         False      False           False  county,hhs,hrr,msa,nation,state
+2   chng      smoothed_outpatient_covid                 COVID-Confirmed Doctor Visits   False                      COVID-Confirmed Doctor Visits  Estimated percentage of outpatient doctor visi...       day       Date       Value    raw    early             bad         True       False         False      False           False  county,hhs,hrr,msa,nation,state
+3   chng  smoothed_adj_outpatient_covid  COVID-Confirmed Doctor Visits (Day-adjusted)   False                      COVID-Confirmed Doctor Visits  Estimated percentage of outpatient doctor visi...       day       Date       Value    raw    early             bad         True       False         False      False           False  county,hhs,hrr,msa,nation,state
+4   chng        smoothed_outpatient_flu             Influenza-Confirmed Doctor Visits   False  Estimated percentage of outpatient doctor visi...  Estimated percentage of outpatient doctor visi...       day        Day       Value    raw    early             bad         True       False         False       None            None  county,hhs,hrr,msa,nation,state
+
+This DataFrame contains one row for each available signal, with the following columns:
+
+- ``source`` - Data source name.
+- ``signal`` - Signal name.
+- ``name`` - Name of signal.
+- ``active`` - Whether the signal is currently being updated or not. Signals may be inactive because the sources have become unavailable, other sources have replaced them, or additional work is required for us to continue updating them.
+- ``short_description`` - Brief description of the signal.
+- ``description`` - Full description of the signal.
+- ``geo_types`` - Spatial resolution of the signal (e.g., ``county``, ``hrr``, ``msa``, ``dma``, ``state``). More detail about all ``geo_types`` is given in the `geographic coding documentation <https://cmu-delphi.github.io/delphi-epidata/api/covidcast_geography.html>`_.
+- ``time_type`` - Temporal resolution of the signal (e.g., day, week; see `date coding details <https://cmu-delphi.github.io/delphi-epidata/api/covidcast_times.html>`_).
+- ``time_label`` - The time label ("Date", "Week").
+- ``value_label`` - The value label ("Value", "Percentage", "Visits", "Visits per 100,000 people").
+- ``format`` - The value format ("per100k", "percent", "fraction", "count", "raw").
+- ``category`` - The signal category ("early", "public", "late", "other").
+- ``high_values_are`` - What higher values of the signal indicate ("good", "bad", "neutral").
+- ``is_smoothed`` - Whether the signal is smoothed.
+- ``is_weighted`` - Whether the signal is weighted.
+- ``is_cumulative`` - Whether the signal is cumulative.
+- ``has_stderr`` - Whether the signal has a ``stderr`` statistic.
+- ``has_sample_size`` - Whether the signal has a ``sample_size`` statistic.
diff --git a/docs/versioned_data.rst b/docs/versioned_data.rst
@@ -0,0 +1,154 @@
+Understanding and accessing versioned data
+==========================================
+
+
+The Epidata API records not just each signal's estimate for a given location
+on a given day, but also *when* that estimate was made, and all updates to that
+estimate.
+
+For example, let's look at the `doctor visits
+signal <https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html>`_
+from the ``covidcast`` `endpoint <https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html>`_,
+which estimates the percentage of outpatient doctor visits that are
+COVID-related.
+
+Consider a result row with ``time_value = 2020-05-01`` for
+``geo_values = "pa"``. This is an estimate for Pennsylvania on
+May 1, 2020. That estimate was *issued* on May 5, 2020, the delay being due to
+the aggregation of data by our source and the time taken by the Epidata API to
+ingest the data provided.
+
+Later, the estimate for May 1st could be updated,
+perhaps because additional visit data from May 1st arrived at our source and was
+reported to us. This constitutes a new *issue* of the data.
+
+
+Data known "as of" a specific date
+----------------------------------
+
+By default, endpoint functions fetch the most recent issue available. This
+is the best option for users who simply want to graph the latest data or
+construct dashboards. But if we are interested in knowing *when* data was
+reported, we can request specific data versions using the ``as_of``, ``issues``, or
+``lag`` arguments.
+
+**Note** that these are mutually exclusive; only one can be specified
+at a time. Also, not all endpoints support all three parameters, so please
+check the documentation for that specific endpoint.
+
+First, we can request the data that was available *as of* a specific date, using
+the ``as_of`` argument:
+
+>>> from epidatpy import EpiDataContext, EpiRange
+>>> epidata = EpiDataContext(use_cache=True, cache_max_age_days=1)
+>>> apicall = epidata.pub_covidcast(
+...    data_source = "doctor-visits",
+...    signals = "smoothed_cli", 
+...    time_type = "day",
+...    time_values = EpiRange("2020-05-01", "2020-05-01"),
+...    geo_type = "state",
+...    geo_values = "pa",
+...    as_of = "2020-05-07"
+... )
+>>> apicall.df().head()
+          source        signal geo_type geo_value time_type time_value      issue  lag    value  stderr sample_size  direction  missing_value  missing_stderr  missing_sample_size
+0  doctor-visits  smoothed_cli    state        pa       day 2020-05-01 2020-05-07    6  2.32192    <NA>        <NA>       <NA>              0               5                    5
+
+This shows that an estimate of about 2.3% was issued on May 7. If we don't
+specify ``as_of``, we get the most recent estimate available:
+
+>>> apicall = epidata.pub_covidcast(
+...    data_source = "doctor-visits",
+...    signals = "smoothed_cli", 
+...    time_type = "day",
+...    time_values = EpiRange("2020-05-01", "2020-05-01"),
+...    geo_type = "state",
+...    geo_values = "pa"
+... )
+>>> apicall.df().head()
+          source        signal geo_type geo_value time_type time_value      issue  lag     value  stderr sample_size  direction  missing_value  missing_stderr  missing_sample_size
+0  doctor-visits  smoothed_cli    state        pa       day 2020-05-01 2020-07-04   64  5.075015    <NA>        <NA>       <NA>              0               5                    5
+
+Note the substantial change in the estimate, from less than 3% to over 5%,
+reflecting new data that became available after May 7 about visits *occurring on*
+May 1. This illustrates the importance of issue date tracking, particularly
+for forecasting tasks. To backtest a forecasting model on past data, it is
+important to use the data that would have been available *at the time* the model
+was or would have been fit, not data that arrived much later.
+
+Multiple issues of observations
+-------------------------------
+
+By using the ``issues`` argument, we can request all issues in a certain time
+period:
+
+>>> apicall = epidata.pub_covidcast(
+...    data_source = "doctor-visits",
+...    signals = "smoothed_adj_cli",
+...    time_type = "day",
+...    time_values = EpiRange("2020-05-01", "2020-05-01"),
+...    geo_type = "state",
+...    geo_values = "pa",
+...    issues = EpiRange("2020-05-01", "2020-05-15")
+... )
+>>> apicall.df().head(7)
+          source            signal geo_type geo_value time_type time_value      issue  lag     value  stderr sample_size  direction  missing_value  missing_stderr  missing_sample_size
+0  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-01 2020-05-07    6  2.581509    <NA>        <NA>       <NA>              0               5                    5
+1  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-01 2020-05-08    7  3.278896    <NA>        <NA>       <NA>              0               5                    5
+2  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-01 2020-05-09    8  3.321781    <NA>        <NA>       <NA>              0               5                    5
+3  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-01 2020-05-12   11  3.588683    <NA>        <NA>       <NA>              0               5                    5
+4  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-01 2020-05-13   12  3.631978    <NA>        <NA>       <NA>              0               5                    5
+5  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-01 2020-05-14   13  3.658009    <NA>        <NA>       <NA>              0               5                    5
+
+This estimate was clearly updated many times as new data for May 1st arrived.
+
+**Note** that these results include only data issued or updated between
+(inclusive) 2020-05-01 and 2020-05-15. If a value was first reported on
+2020-04-15, and never updated, a query for issues between 2020-05-01 and
+2020-05-15 will not include that value among its results.
+
+Observations issued with a specific lag
+---------------------------------------
+
+Finally, we can use the ``lag`` argument to request only data reported with a
+certain lag. For example, requesting a lag of 7 days fetches only data issued
+exactly 7 days after the corresponding ``time_value``:
+
+>>> apicall = epidata.pub_covidcast(
+...    data_source = "doctor-visits",
+...    signals = "smoothed_adj_cli",
+...    time_type = "day",
+...    time_values = EpiRange("2020-05-01", "2020-05-07"),
+...    geo_type = "state",
+...    geo_values = "pa",
+...    lag = 7
+... )
+>>> apicall.df().head()
+          source            signal geo_type geo_value time_type time_value      issue  lag     value  stderr sample_size  direction  missing_value  missing_stderr  missing_sample_size
+0  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-01 2020-05-08    7  3.278896    <NA>        <NA>       <NA>              0               5                    5
+1  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-02 2020-05-09    7  3.225292    <NA>        <NA>       <NA>              0               5                    5
+2  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-05 2020-05-12    7  2.779908    <NA>        <NA>       <NA>              0               5                    5
+3  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-06 2020-05-13    7  2.557698    <NA>        <NA>       <NA>              0               5                    5
+4  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-07 2020-05-14    7  2.191677    <NA>        <NA>       <NA>              0               5                    5
+
+**Note** that though this query requested all values between 2020-05-01 and
+2020-05-07, May 3rd and May 4th were *not* included in the results set. This is
+because the query will only include a result for May 3rd if a value was issued
+on May 10th (a 7-day lag), but in fact the value was not updated on that day:
+
+>>> apicall = epidata.pub_covidcast(
+...    data_source = "doctor-visits",
+...    signals = "smoothed_adj_cli",
+...    time_type = "day",
+...    time_values = EpiRange("2020-05-03", "2020-05-03"),
+...    geo_type = "state",
+...    geo_values = "pa",
+...    issues = EpiRange("2020-05-09", "2020-05-15")
+... )
+>>> apicall.df().head()
+          source            signal geo_type geo_value time_type time_value      issue  lag     value  stderr sample_size  direction  missing_value  missing_stderr  missing_sample_size
+0  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-03 2020-05-09    6  2.788618    <NA>        <NA>       <NA>              0               5                    5
+1  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-03 2020-05-12    9  3.015368    <NA>        <NA>       <NA>              0               5                    5
+2  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-03 2020-05-13   10   3.03931    <NA>        <NA>       <NA>              0               5                    5
+3  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-03 2020-05-14   11  3.021245    <NA>        <NA>       <NA>              0               5                    5
+4  doctor-visits  smoothed_adj_cli    state        pa       day 2020-05-03 2020-05-15   12  3.048725    <NA>        <NA>       <NA>              0               5                    5
diff --git a/docs_smoke_test.py b/docs_smoke_test.py
@@ -1,7 +1,16 @@
 from epidatpy import CovidcastEpidata, EpiDataContext, EpiRange
 import pandas as pd
 
+# Set common options and context
+
+pd.set_option('display.max_columns', None)
+pd.set_option('display.max_rows', None)
+pd.set_option('display.width', 1000)
+
 epidata = EpiDataContext(use_cache=True, cache_max_age_days=1)
+
+# Getting started with epidatpy
+
 apicall = epidata.pub_covidcast(
     data_source = "fb-survey",
     signals = "smoothed_cli", 
@@ -11,10 +20,6 @@
     time_values = EpiRange(20210405, 20210410))
 print(apicall)
 
-pd.set_option('display.max_columns', None)
-pd.set_option('display.max_rows', None)
-pd.set_option('display.width', 1000)
-
 data = apicall.df()
 print(data.head())
 
@@ -72,4 +77,84 @@
 
 data.plot(x="time_value", y="value", title="Smoothed CLI from Facebook Survey", xlabel="Date", ylabel="CLI")
 plt.subplots_adjust(bottom=.2)
-plt.show()
+plt.show()
+
+# Signal discovery
+
+epidata2 = CovidcastEpidata()
+sources = epidata2.source_df
+print(sources.head())
+
+signals = epidata2.signal_df
+print(signals.head())
+
+# Versioned data
+
+apicall6 = epidata.pub_covidcast(
+    data_source = "doctor-visits",
+    signals = "smoothed_cli", 
+    time_type = "day",
+    time_values = EpiRange("2020-05-01", "2020-05-01"),
+    geo_type = "state",
+    geo_values = "pa",
+    as_of = "2020-05-07"
+)
+print(apicall6)
+
+data6 = apicall6.df()
+print(data6.head())
+
+apicall7 = epidata.pub_covidcast(
+    data_source = "doctor-visits",
+    signals = "smoothed_cli", 
+    time_type = "day",
+    time_values = EpiRange("2020-05-01", "2020-05-01"),
+    geo_type = "state",
+    geo_values = "pa"
+)
+print(apicall7)
+
+data7 = apicall7.df()
+print(data7.head())
+
+apicall8 = epidata.pub_covidcast(
+    data_source = "doctor-visits",
+    signals = "smoothed_adj_cli",
+    time_type = "day",
+    time_values = EpiRange("2020-05-01", "2020-05-01"),
+    geo_type = "state",
+    geo_values = "pa",
+    issues = EpiRange("2020-05-01", "2020-05-15")
+)
+print(apicall8)
+
+data8 = apicall8.df()
+print(data8.head(7))
+
+apicall9 = epidata.pub_covidcast(
+    data_source = "doctor-visits",
+    signals = "smoothed_adj_cli",
+    time_type = "day",
+    time_values = EpiRange("2020-05-01", "2020-05-07"),
+    geo_type = "state",
+    geo_values = "pa",
+    lag = 7
+)
+print(apicall9)
+
+data9 = apicall9.df()
+print(data9.head())
+
+apicall10 = epidata.pub_covidcast(
+    data_source = "doctor-visits",
+    signals = "smoothed_adj_cli",
+    time_type = "day",
+    time_values = EpiRange("2020-05-03", "2020-05-03"),
+    geo_type = "state",
+    geo_values = "pa",
+    issues = EpiRange("2020-05-09", "2020-05-15")
+)
+print(apicall10)
+
+data10 = apicall10.df()
+print(data10.head())