<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
  <head>
    <!-- Google tag (gtag.js) -->
    <!-- The first script loads the tag library asynchronously; the inline script below sets up the queue it reads. -->
    <script
      async
      src="https://www.googletagmanager.com/gtag/js?id=G-25389D1SR4"
    ></script>
    <script>
      // Reuse an existing dataLayer array if one is already defined; otherwise start an empty one.
      window.dataLayer = window.dataLayer || [];
      // Pushes the raw `arguments` object itself (not an array copy) onto dataLayer.
      function gtag() {
        dataLayer.push(arguments);
      }
      // Queue the two bootstrap entries: a "js" entry carrying the current Date,
      // then a "config" entry for the same tag ID used in the loader URL above.
      gtag("js", new Date());

      gtag("config", "G-25389D1SR4");
    </script>

    <meta charset="utf-8" />
    <!-- Zoom is deliberately left enabled (no maximum-scale / user-scalable=no) so the slides can be magnified. -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />

    <!-- Search-engine metadata -->
    <meta name="robots" content="index,follow" />
    <meta name="author" content="Stefanie Molin" />
    <meta
      name="keywords"
      content="technology, workshop, data visualization, Python, data science, slides"
    />
    <meta
      name="description"
      content="Relying solely on simple summary statistics like the mean, median, or standard deviation is not enough to describe complex data. Come and see why this is the case and learn what it takes to translate research into an open-source library."
    />

    <!-- Browser hints -->
    <meta name="theme-color" content="#000" />
    <meta name="referrer" content="origin" />

    <!-- Open Graph card -->
    <meta property="og:type" content="website" />
    <meta property="og:locale" content="en_US" />
    <meta property="og:site_name" content="Stefanie Molin" />
    <meta
      property="og:url"
      content="https://stefaniemolin.com/data-morph-talk/"
    />
    <meta
      property="og:title"
      content="Data Morph: A Cautionary Tale of Summary Statistics | Stefanie Molin"
    />
    <meta
      property="og:description"
      content="Relying solely on simple summary statistics like the mean, median, or standard deviation is not enough to describe complex data. Come and see why this is the case and learn what it takes to translate research into an open-source library."
    />
    <meta
      property="og:image"
      content="https://stefaniemolin.com/assets/articles/data-science/introducing-data-morph/panda-to-star.gif"
    />
    <meta property="og:image:width" content="774" />
    <meta property="og:image:height" content="379" />
    <meta
      property="og:image:alt"
      content="Data Morph: A Cautionary Tale of Summary Statistics"
    />

    <!-- Twitter card -->
    <meta name="twitter:card" content="summary_large_image" />
    <meta name="twitter:site" content="@StefanieMolin" />
    <meta name="twitter:creator" content="@StefanieMolin" />

    <title>
      Data Morph: A Cautionary Tale of Summary Statistics slides | Stefanie
      Molin
    </title>

    <!-- Web app manifest and site icons; declaration order is kept exactly as before. -->
    <link rel="manifest" href="/favicon/site.webmanifest" />
    <link rel="shortcut icon" type="image/x-icon" href="/favicon/favicon.ico" />
    <link rel="apple-touch-icon" sizes="180x180" href="/favicon/apple-touch-icon.png" />
    <link rel="icon" type="image/png" sizes="32x32" href="/favicon/favicon-32x32.png" />
    <link rel="icon" type="image/png" sizes="16x16" href="/favicon/favicon-16x16.png" />
    <link rel="icon" type="image/png" sizes="192x192" href="/favicon/android-chrome-192x192.png" />
    <link rel="icon" type="image/png" sizes="512x512" href="/favicon/android-chrome-512x512.png" />

    <!-- reveal.js 5.1.0 core styles and the "simple" theme -->
    <link
      rel="stylesheet"
      href="https://unpkg.com/reveal.js@5.1.0/dist/reset.css"
    />
    <link
      rel="stylesheet"
      href="https://unpkg.com/reveal.js@5.1.0/dist/reveal.css"
    />
    <link
      rel="stylesheet"
      href="https://unpkg.com/reveal.js@5.1.0/dist/theme/simple.css"
    />

    <!--syntax highlighting in code snippets (light theme)-->
    <link
      rel="stylesheet"
      href="https://unpkg.com/@highlightjs/cdn-assets@11.9.0/styles/stackoverflow-light.min.css"
    />

    <!-- Font Awesome icons -->
    <link
      rel="stylesheet"
      href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.1.1/css/all.min.css"
    />

    <style>
      /* Heading sizes, set through the --r-heading*-size custom properties. */
      :root {
        --r-heading1-size: 2em;
        --r-heading2-size: 1.2em;
        --r-heading3-size: 1em;
        --r-heading4-size: 0.85em;
      }
      /* Left-align slide content in the normal, PDF, and scroll layouts. */
      .reveal .slides > section,
      .reveal .pdf-page > section,
      .reveal .slides > .scroll-page {
        text-align: left !important;
      }
      /* Strip spacing from every image; !important also beats inline margin/padding. */
      img {
        padding: 0 !important;
        margin: 0 !important;
      }
      /* Footnote area: the negative text-indent plus left padding gives a hanging indent. */
      .footnotes {
        text-indent: -5px;
        padding-left: 20px;
        width: 90%;
      }
      .footnote {
        text-align: left;
      }
      /* Each footnote is prefixed with an asterisk. */
      .footnote::before {
        content: "*";
      }
      /* Body text sizes. */
      h6 {
        font-size: 0.8em;
      }
      ul,
      ol,
      p {
        font-size: 0.65em;
      }
      ul.references {
        font-size: 0.55em;
      }
      #bio ul {
        font-size: 0.715em;
      }
      /* Space between list items, except after the last one. */
      ul > li:not(:last-child),
      ol > li:not(:last-child) {
        margin-bottom: 0.5em;
      }
      /* Opt-in centering, since sections are left-aligned above. */
      .center {
        text-align: center;
      }
      small {
        font-size: 0.3em !important;
        text-align: center;
      }
      /* Footer: pinned to the bottom, centered, kept on one line. */
      .footer {
        position: absolute;
        bottom: 10px;
        /* white-space is the widely supported fallback; text-wrap: nowrap is only
           honored by recent browsers and would otherwise be ignored. */
        white-space: nowrap;
        text-wrap: nowrap;
        width: 100%;
        text-align: center;
        z-index: 100;
        font-size: 4vmin;
      }
      .license {
        font-size: 1.5vmin;
        padding-top: 0.5vmin;
      }
      /* Code blocks: padded, rounded corners. */
      pre > code {
        padding: 20px !important;
        border-radius: 10px;
      }
      pre.code-wrapper {
        border-radius: 10px;
      }
      /* Inline code (any <code> not directly inside <pre>). */
      :not(pre) > code {
        background-color: #eee;
        padding: 0.125rem 0.25rem;
        border-radius: 5px;
      }
      /* Hide the line-number cells for code blocks carrying .hide-line-numbers. */
      .hide-line-numbers .hljs-ln-numbers {
        display: none;
      }
      /* Start-aligned variant for stacked content. */
      .r-stack-left {
        justify-content: start;
      }
      .r-stack-left > p {
        margin: 0 !important;
      }
    </style>
  </head>
  <body>
    <div class="reveal">
      <!-- Footer markup (site link and license), hidden by default via display: none. -->
      <!-- NOTE(review): presumably copied onto the slides by a script elsewhere in the file; confirm. -->
      <div id="footer-info" style="display: none">
        <div class="footer">
          <a href="https://stefaniemolin.com">stefaniemolin.com</a>
          <div class="license">
            License:
            <a
              href="https://creativecommons.org/licenses/by-nc-sa/4.0/"
              style="z-index: 1"
              target="_blank"
              rel="noopener noreferrer"
            >
              CC BY-NC-SA 4.0
            </a>
          </div>
        </div>
      </div>
      <div class="slides">
        <section>
          <!-- Title slide: talk title, subtitle, and speaker name. -->
          <h1 class="center">Data Morph</h1>
          <h2 class="center">A Cautionary Tale of Summary Statistics</h2>
          <br />
          <br />
          <h3 class="center">Stefanie Molin</h3>
        </section>
        <section id="bio">
          <!-- Speaker bio. The #bio id is also the hook for the larger list font size in the page's style block. -->
          <h2>Bio</h2>
          <ul>
            <li>👩🏻‍💻 Software engineer at Bloomberg in NYC</li>
            <li>✨ Founding member of Bloomberg's Data Science Community</li>
            <li>
              ✍ Author of "<a
                href="https://stefaniemolin.com/books/Hands-On-Data-Analysis-with-Pandas-2nd-edition/"
                >Hands-On Data Analysis with Pandas</a
              >"
            </li>
            <li>
              🎓 Bachelor's in operations research from Columbia University
            </li>
            <li>
              🎓 Master's in computer science (ML specialization) from Georgia
              Tech
            </li>
          </ul>
          <!-- Talk outline, kept in an aside.notes element (the reveal.js speaker-notes convention). -->
          <aside class="notes">
            <h2>Talk outline</h2>
            <div>
              <ul>
                <li>Why summary statistics aren't enough</li>
                <li>Introduction to Data Morph</li>
                <li>How Data Morph works</li>
                <li>Limitations and areas for future work</li>
                <li>Lessons learned and challenges faced</li>
              </ul>
            </div>
          </aside>
        </section>
        <section id="summary-statistics">
          <!-- Divider slide: a single centered heading. -->
          <h2 class="center">Summary statistics aren't enough</h2>
        </section>
        <section id="visually-different-datasets">
          <!-- Example-datasets image followed by the Python logo trademark footnote. -->
          <p>These datasets are clearly different:</p>
          <div class="center">
            <img src="media/example_datasets.png" alt="example datasets" />
          </div>
          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="visually-different-same-statistics">
          <!-- Summary-statistics image followed by the Python logo trademark footnote. -->
          <p>
            However, we would not know that if we were to only look at the
            summary statistics:
          </p>
          <div class="center">
            <img src="media/stats.png" alt="summary statistics are the same" />
          </div>
          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="moments">
          <!-- Moments image with two footnotes: an explanation of the moments and the trademark notice. -->
          <p>
            What we call <em>summary statistics</em> summarize only part of the
            distribution. We need many <b>moments</b> to describe the shape of a
            distribution (and distinguish between these datasets):
          </p>
          <div class="center">
            <img src="media/moments.png" alt="moments" />
          </div>
          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <div>
              <small class="footnote"
                >The first moment is the center of mass of the distribution (the
                mean); here, we have central moments, which are independent of
                translation, so our first moment is zero (we subtract the mean).
                The second moment is the variance, but once we get to the third
                moment (skewness), we can differentiate between these datasets.
                Further moments, like kurtosis (fourth), provide even more
                information.</small
              >
            </div>
            <div style="margin-top: -8px">
              <small class="footnote"
                >The Python logo is a
                <a
                  href="https://www.python.org/psf/trademarks/"
                  target="_blank"
                  rel="noopener noreferrer"
                  >trademark of the Python Software Foundation (PSF)</a
                >, used with permission from the Foundation.</small
              >
            </div>
          </div>
        </section>
        <section id="marginal-distributions">
          <!-- Marginal-distributions image followed by the Python logo trademark footnote. -->
          <p>
            Adding in histograms for the marginal distributions, we can see the
            distributions of both <em>x</em> and <em>y</em> are indeed quite
            different across datasets. Some of these differences are captured in
            the third moment (<b>skewness</b>) and the fourth moment
            (<b>kurtosis</b>), which measure the asymmetry and weight in the
            tails of the distribution, respectively:
          </p>
          <div class="center">
            <img src="media/with_marginals.png" alt="marginal distributions" />
          </div>
          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="correlation">
          <!-- Pearson correlation slide; shows media/stats.png followed by the trademark footnote. -->
          <p>
            However, the moments aren't capturing the relationship between
            <em>x</em> and <em>y</em>. If we suspect a linear relationship, we
            may use the Pearson correlation coefficient, which is the same for
            all three datasets below. Here, the visualization tells us a lot
            more information about the relationships between the variables:
          </p>
          <div class="center">
            <img src="media/stats.png" alt="summary statistics static" />
          </div>
          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="pearson-correlation-coefficient">
          <!-- Text-only slide: a high Pearson correlation does not imply a linear relationship. -->
          <p>
            The Pearson correlation coefficient measures
            <em>linear</em> correlation, so if we don't visualize our data, then
            we have another problem: a high correlation (close in absolute value
            to 1) does not mean the relationship is actually linear. Without a
            visualization to contextualize the summary statistics, we do not
            have an accurate understanding of the data.
          </p>
        </section>
        <section id="anscombe-quartet">
          <!-- Anscombe's Quartet image with an attribution caption; the caption uses the shared .center class. -->
          <p>
            For example, all four datasets in
            <b>Anscombe's Quartet</b> (constructed in 1973) have strong
            correlations, but only <b>I</b> and <b>III</b> have linear
            relationships:
          </p>
          <div class="center">
            <img src="media/anscombe.png" alt="Anscombe's Quartet" />
          </div>
          <div class="center">
            <small
              >This visual was created by Stefanie Molin using the Anscombe's
              Quartet dataset as provided in
              <a
                href="https://github.com/mwaskom/seaborn"
                target="_blank"
                rel="noopener noreferrer"
                >seaborn.</a
              ></small
            >
          </div>
        </section>
        <section id="visualization-is-essential">
          <!-- Takeaway slide: a single centered statement. -->
          <h3 class="center">
            Visualization is an essential part of any data analysis.
          </h3>
        </section>
        <section id="hypothesis-is-a-liability">
          <!-- Introduces the Yanai and Lercher paper; the second paragraph is a fragment (shown as a separate step). -->
          <p>
            In their 2020 paper,
            <a
              href="https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02133-w"
              target="_blank"
              rel="noopener noreferrer"
              ><em>A hypothesis is a liability</em></a
            >, researchers Yanai and Lercher argue that
            <b
              >simply approaching a dataset with a hypothesis may limit the
              thoroughness to which the data is explored</b
            >.
          </p>
          <p class="fragment">Let's take a look at their experiment.</p>
        </section>
        <section id="hypothesis-is-a-liability-experiment">
          <!-- Describes the experiment setup and the three hypotheses, with a right-aligned citation link. -->
          <h4>The experiment</h4>
          <p>
            Students in a statistical data analysis course were split into two
            groups. One group was given the open-ended task of exploring the
            data, while the other group was instructed to test the following
            hypotheses:
          </p>
          <ol style="padding-left: 20px">
            <li>
              There is a difference in the mean number of steps between women
              and men.
            </li>
            <li>
              The correlation coefficient between steps and BMI is negative for
              women.
            </li>
            <li>
              The correlation coefficient between steps and BMI is positive for
              men.
            </li>
          </ol>
          <div style="text-align: right">
            <p style="font-size: small">
              <a
                href="https://doi.org/10.1101/2020.07.30.228916"
                rel="noopener noreferrer"
                target="_blank"
              >
                (Yanai &amp; Lercher, 2020)
              </a>
            </p>
          </div>
        </section>
        <section id="hypothesis-is-a-liability-experiment-dataset">
          <!-- Figure 1 of the paper, loaded from the publisher's server, with a license attribution caption. -->
          <p>Here's what that dataset looked like:</p>
          <div class="center">
            <img
              style="width: 60%"
              alt="Figure 1 from 'A hypothesis is a liability' by Itai Yanai &amp; Martin Lercher"
              src="https://media.springernature.com/lw685/springer-static/image/art%3A10.1186%2Fs13059-020-02133-w/MediaObjects/13059_2020_2133_Fig1_HTML.png?as=webp"
            />
          </div>

          <div class="center">
            <small
              >Figure 1 from
              <em
                ><a
                  href="https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02133-w"
                  target="_blank"
                  rel="noopener noreferrer"
                  >A hypothesis is a liability</a
                ></em
              >
              by Itai Yanai &amp; Martin Lercher (<a
                href="https://creativecommons.org/licenses/by/4.0/"
                target="_blank"
                rel="noopener noreferrer"
                >Creative Commons Attribution 4.0 International License</a
              >).</small
            >
          </div>
        </section>
        <section id="how-can-we-encourage-thoroughness">
          <!-- Question slide: a single centered heading. -->
          <h4 class="center">
            How can we encourage students and practitioners alike to be more
            thorough in their analyses?
          </h4>
        </section>
        <section id="teaching-aids">
          <!-- Answer slide: a single centered heading. -->
          <h3 class="center">Create more memorable teaching aids</h3>
        </section>
        <section id="datasaurus-dozen">
          <!-- Datasaurus Dozen image with an attribution caption. -->
          <p>
            In 2017, Autodesk researchers created the <b>Datasaurus Dozen</b>,
            building upon the idea of Anscombe's Quartet to make a more
            impactful example:
          </p>
          <div class="center">
            <!-- width is a bare pixel count (the attribute takes no unit). -->
            <!-- NOTE: the inline margin is overridden by the page-level `img { margin: 0 !important }` rule. -->
            <img
              src="media/datasaurus.png"
              alt="Datasaurus Dozen"
              width="500"
              style="margin: -10px auto"
            />
            <br />
            <div style="margin: auto 5%">
              <small
                >This visual was created by Stefanie Molin using the Datasaurus
                Dozen dataset as provided by
                <a
                  href="https://github.com/jmatejka/same-stats-different-graphs"
                  target="_blank"
                  rel="noopener noreferrer"
                  >jmatejka/same-stats-different-graphs.</a
                ></small
              >
            </div>
          </div>
        </section>
        <section id="animation">
          <!-- Animated GIF of the Datasaurus-to-circle transition, with a caption crediting Data Morph. -->
          <p>
            They also employed animation, which is even more impactful. Every
            shape as we transition between the Datasaurus and the circle shares
            the same summary statistics:
          </p>
          <div class="center">
            <img
              src="media/dino_to_circle.gif"
              alt="Datasaurus to circle (Data Morph)"
            />
            <br />
            <small
              >This visual was created by Stefanie Molin using Data
              Morph.</small
            >
          </div>
        </section>
        <section id="but-no-we-have-a-problem">
          <!-- Transition slide: one centered line. -->
          <p class="center">But, now we have a new problem...</p>
        </section>
        <section id="what-is-special-about-the-datasaurus">
          <!-- Question heading; the answer is a fragment (shown as a separate step). -->
          <h3 class="center">What's so special about the Datasaurus?</h3>
          <h4 class="center fragment">NOTHING!</h4>
        </section>
        <section id="why-i-built-data-morph">
          <!-- Motivation for Data Morph; the closing paragraph with the project link is a fragment. -->
          <p>
            Since there was no easy way to do this for arbitrary datasets,
            people assumed that this capability is a property of the Datasaurus
            and were shocked to see this work with other shapes. The more ways
            people see this and the more memorable they are, the better this
            concept will stick &ndash; repetition is key to learning.
          </p>
          <p class="fragment">
            This is why I built
            <a
              href="https://stefaniemolin.com/data-morph/stable/index.html"
              target="_blank"
              rel="noopener noreferrer"
              >Data Morph</a
            >.
          </p>
        </section>
        <section id="Data-Morph-education-tool">
          <!-- Bullet list of the limitations of earlier approaches that Data Morph addresses. -->
          <h3>Data Morph is an educational tool</h3>
          <p>It addresses the limitations of previous methods:</p>
          <ul>
            <li>
              installable Python package that can be used without hacking at the
              codebase
            </li>
            <li>animated results are provided automatically</li>
            <li>possible to use additional datasets (built-in and custom)</li>
            <li>
              people can experiment with their own datasets and various target
              shapes
            </li>
            <li>the number of possible examples is no longer frozen</li>
          </ul>
        </section>
        <section id="introducing-data-morph">
          <!-- Python-logo-to-heart animation followed by the trademark footnote. -->
          <h2>Data Morph (2023)</h2>

          <div class="center">
            <img
              src="media/Python_to_heart.gif"
              alt="morphing the Python logo into a heart"
            />
          </div>

          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="example-code">
          <!-- Shell commands: install the package, then morph the Python dataset into a heart. -->
          <!-- Whitespace inside <pre> is significant; data-trim is the reveal.js highlight attribute for trimming it. -->
          <p>Here's the code to create that example:</p>
          <pre>
            <code data-trim class="language-shell">
              $ python -m pip install data-morph-ai
              $ data-morph --start-shape Python --target-shape heart
            </code>
          </pre>
        </section>
        <section id="behind-the-scenes">
          <!-- Python equivalent of the CLI example: load a dataset, generate a target shape, morph. -->
          <!-- The same snippet is repeated on later slides with different lines highlighted. -->
          <p>Here's what's going on behind the scenes:</p>
          <pre>
            <code data-trim class="language-python">
              from data_morph.data.loader import DataLoader
              from data_morph.morpher import DataMorpher
              from data_morph.shapes.factory import ShapeFactory


              dataset = DataLoader.load_dataset('Python')
              target_shape = ShapeFactory(dataset).generate_shape('heart')

              morpher = DataMorpher(decimals=2, in_notebook=False)
              _ = morpher.morph(dataset, target_shape)
            </code>
          </pre>
        </section>
        <section id="how-it-works">
          <!-- Divider slide: a single centered heading. -->
          <h2 class="center">A high-level overview of how it works</h2>
        </section>
        <section id="select-a-starting-dataset">
          <!-- Step 1. data-line-numbers="1,6" picks out the DataLoader import and the load_dataset call. -->
          <!-- The hide-line-numbers class hides the number column via the rule in the page's style block. -->
          <h3>1. Select a starting dataset</h3>
          <pre>
            <code data-trim class="language-python hide-line-numbers" data-line-numbers="1,6">
              from data_morph.data.loader import DataLoader
              from data_morph.morpher import DataMorpher
              from data_morph.shapes.factory import ShapeFactory


              dataset = DataLoader.load_dataset('Python')
              target_shape = ShapeFactory(dataset).generate_shape('heart')

              morpher = DataMorpher(decimals=2, in_notebook=False)
              _ = morpher.morph(dataset, target_shape)
            </code>
          </pre>
        </section>
        <section id="bounds">
          <!-- Explains the Dataset class and its bounds, with an illustration and the trademark footnote. -->
          <h4>Automatically-calculated bounds</h4>
          <p>
            Data Morph provides the <code>Dataset</code> class that wraps the
            data (stored as a <code>pandas.DataFrame</code>) with information
            about bounds for the data, the morphing process, and plotting. This
            allows for the use of arbitrary datasets by providing a way to
            calculate target shapes &ndash; no more hardcoded values.
          </p>
          <div class="center">
            <!-- width is a bare pixel count (the attribute takes no unit). -->
            <img
              src="media/bounds.png"
              alt="automatically-calculated bounds"
              width="400"
            />
          </div>
          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="built-in-datasets">
          <!-- Overview image of the built-in datasets with a version and permissions note. -->
          <h4>Built-in datasets</h4>
          <p>
            To spark creativity, there are built-in datasets to inspire you:
          </p>
          <div class="center">
            <!-- width is a bare pixel count (the attribute takes no unit). -->
            <img
              src="media/available_datasets.png"
              alt="built-in datasets"
              width="450"
            />
            <br />
            <small
              >Note: Currently displaying what's available as of the v0.2.0
              release. All logos are used with
              <a
                href="https://stefaniemolin.com/data-morph/stable/api/data_morph.data.loader.html#id1"
                target="_blank"
                rel="noopener noreferrer"
                >permission</a
              >.</small
            >
          </div>
        </section>
        <section id="generate-a-target-shape">
          <!-- Step 2. data-line-numbers="3,7" picks out the ShapeFactory import and the generate_shape call. -->
          <!-- The hide-line-numbers class hides the number column via the rule in the page's style block. -->
          <h3>2. Generate a target shape based on the dataset</h3>
          <pre>
            <code data-trim class="language-python hide-line-numbers" data-line-numbers="3,7">
              from data_morph.data.loader import DataLoader
              from data_morph.morpher import DataMorpher
              from data_morph.shapes.factory import ShapeFactory


              dataset = DataLoader.load_dataset('Python')
              target_shape = ShapeFactory(dataset).generate_shape('heart')

              morpher = DataMorpher(decimals=2, in_notebook=False)
              _ = morpher.morph(dataset, target_shape)
            </code>
          </pre>
        </section>
        <section id="scaling-and-translating-target-shapes">
          <!-- Explains how target shapes are fitted to the dataset, with an illustration and the trademark footnote. -->
          <h4>Scaling and translating target shapes</h4>
          <p>
            Depending on the target shape, bounds and/or statistics from the
            dataset are used to generate a custom target shape for the dataset
            to morph into.
          </p>
          <div class="center">
            <!-- Percentage width is set in CSS; the width attribute only accepts a pixel count. -->
            <img
              src="media/fitting_shapes.png"
              alt="shapes are calculated based on input data"
              style="width: 80%"
            />
          </div>
          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="built-in-target-shapes">
          <!-- Overview image of the built-in target shapes with a version note. -->
          <h4>Built-in target shapes</h4>
          <p>The following target shapes are currently available:</p>
          <div class="center">
            <!-- Percentage width is set in CSS; the width attribute only accepts a pixel count. -->
            <img
              src="media/available_shapes.png"
              alt="built-in target shapes"
              style="width: 60%"
            />
            <br />
            <small
              >Note: Currently displaying what's available as of the v0.2.0
              release.</small
            >
          </div>
        </section>
        <section id="shape-class-hierarchy">
          <!-- UML diagram (SVG) of the Shape class hierarchy, with a note about omitted classes. -->
          <h4>The <code>Shape</code> class hierarchy</h4>
          <p>
            In Data Morph, shapes are structured as a hierarchy of classes,
            which must provide a <code>distance()</code> method. This makes them
            interchangeable in the morphing logic.
          </p>
          <div class="center">
            <img src="media/uml/shapes_uml.svg" alt="hierarchy of shapes" />
            <br />
            <small
              >Note: The ... boxes represent classes omitted for space.</small
            >
          </div>
        </section>
        <section id="morph">
          <!-- Step 3. data-line-numbers="2,9-10" picks out the DataMorpher import and the two morpher lines. -->
          <!-- The hide-line-numbers class hides the number column via the rule in the page's style block. -->
          <h3>3. Morph the dataset into the target shape</h3>
          <pre>
            <code data-trim class="language-python hide-line-numbers" data-line-numbers="2,9-10">
              from data_morph.data.loader import DataLoader
              from data_morph.morpher import DataMorpher
              from data_morph.shapes.factory import ShapeFactory


              dataset = DataLoader.load_dataset('Python')
              target_shape = ShapeFactory(dataset).generate_shape('heart')

              morpher = DataMorpher(decimals=2, in_notebook=False)
              _ = morpher.morph(dataset, target_shape)
            </code>
          </pre>
        </section>
        <section id="simulated-annealing">
          <!-- Animated illustration of a single point move, followed by the trademark footnote. -->
          <h4>Simulated annealing</h4>
          <p>
            A point is selected at random (blue) and moved a small, random
            amount to a new location (red), preserving summary statistics. This
            part of the codebase comes from the Autodesk research and is mostly
            unchanged:
          </p>
          <div class="center">
            <!-- Percentage width is set in CSS; the width attribute only accepts a pixel count. -->
            <img
              src="media/simulated_annealing.gif"
              alt="example point movement"
              style="width: 80%"
            />
          </div>

          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="avoiding-local-optima">
          <!-- Animated illustration of a move away from the target shape, followed by the trademark footnote. -->
          <h4>Avoiding local optima</h4>
          <p>
            Sometimes, the algorithm will move a point away from the target
            shape, while still preserving summary statistics. This helps to
            avoid getting stuck:
          </p>
          <div class="center">
            <!-- alt now describes this animation instead of repeating the previous slide's text. -->
            <!-- Percentage width is set in CSS; the width attribute only accepts a pixel count. -->
            <img
              src="media/avoiding_local_optima.gif"
              alt="example of a point moving away from the target shape"
              style="width: 80%"
            />
          </div>

          <!-- Footnote rule, left-aligned with margin-left: 0 instead of the obsolete align attribute. -->
          <hr style="width: 33%; margin-left: 0; margin-bottom: 5px" />
          <div class="footnotes">
            <small class="footnote"
              >The Python logo is a
              <a
                href="https://www.python.org/psf/trademarks/"
                target="_blank"
                rel="noopener noreferrer"
                >trademark of the Python Software Foundation (PSF)</a
              >, used with permission from the Foundation.</small
            >
          </div>
        </section>
        <section id="temperature">
          <!-- Continuation slide (no heading): image of the annealing temperature over time with an explanatory caption. -->
          <p>
            The likelihood of doing this decreases over time and is governed by
            the <b>temperature</b> of the simulated annealing process:
          </p>
          <div class="center">
            <img
              src="media/temperature_over_time.png"
              alt="temperature over time"
            />
            <br />
            <small
              >The temperature falls to zero as we near the final iterations,
              meaning we become more strict about moving toward the target shape
              to finalize the output.</small
            >
          </div>
        </section>
        <section id="decreasing-point-movement-over-time">
          <!-- Slide: maximum point movement shrinks over the iterations; includes two footnotes and speaker notes. -->
          <h4>Decreasing point movement over time</h4>
          <p>
            The maximum amount that a point can move at a given iteration
            decreases over time for a better visual effect. This makes points
            move faster when the morphing starts and slow down as we approach
            the target shape:
          </p>
          <div class="center">
            <img
              src="media/Python_to_heart_forward_only.gif"
              alt="morphing the Python logo into a heart"
            />
          </div>

          <!-- Footnote rule: left-aligned through CSS margins because the align attribute on hr is obsolete. -->
          <hr
            style="
              width: 33%;
              margin-left: 0;
              margin-right: auto;
              margin-bottom: 5px;
            "
          />
          <div class="footnotes">
            <div>
              <small class="footnote"
                >The Python logo is a
                <a
                  href="https://www.python.org/psf/trademarks/"
                  target="_blank"
                  rel="noopener noreferrer"
                  >trademark of the Python Software Foundation (PSF)</a
                >, used with permission from the Foundation.</small
              >
            </div>
            <!-- The negative top margin pulls the second footnote up toward the first one. -->
            <div style="margin-top: -35px">
              <small class="footnote"
                >Varying point movement over time is not part of the Autodesk
                implementation.</small
              >
            </div>
          </div>
          <!-- Speaker notes (the RevealNotes plugin is loaded at the end of this file). -->
          <aside class="notes">
            <p>
              In simulated annealing, we are decreasing temperature over time,
              so we can think of the earlier iterations as matter in a gaseous
              state (the points are moving fast). As the temperature decreases,
              we transition into liquid and eventually solid state, with the
              point movement decreasing.
            </p>
          </aside>
        </section>
        <section id="point-movement-over-time">
          <!-- Continuation slide (no heading): image of the maximum point movement over time, which is not allowed to fall to zero. -->
          <p>
            Unlike temperature, we don't allow this value to fall to zero, since
            we don't want to halt movement:
          </p>
          <div class="center">
            <img
              src="media/maximum_movement_over_time.png"
              alt="easing movement over time"
            />
            <br />
            <small
              >Maximum point movement decreases over time just as temperature
              does.</small
            >
          </div>
        </section>
        <section id="limitations-and-areas-for-future-work">
          <!-- Divider slide: opens the limitations and future work part of the deck. -->
          <h2 class="center">Limitations and areas for future work</h2>
        </section>
        <section id="bald-spots">
          <!-- Limitation slide: points clumping together instead of filling out the target shape. -->
          <h3>&ldquo;Bald spots&rdquo;</h3>
          <p>
            How do we encourage points to fill out the target shape and not just
            clump together?
          </p>
          <div class="center">
            <img src="media/bald_spots.png" alt="bald spots limitation" />
          </div>
        </section>
        <section id="morphing-direction">
          <!-- Limitation slide: only dataset to shape morphing is supported; the table lists what blocks the other directions. -->
          <h3>Morphing direction</h3>
          <p>
            Currently, we can only morph from dataset to shape (and shape to
            dataset by playing the animation in reverse). I would like to
            support dataset to dataset and shape to shape morphing, but there
            are challenges to both:
          </p>
          <table style="font-size: 0.65em">
            <thead>
              <tr>
                <!-- scope="col" ties each header cell to its column for assistive technology. -->
                <th scope="col">Goal</th>
                <th scope="col">Challenges</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>shape&rarr;shape</td>
                <td>
                  determining the initial sizing and possibly aligning scale
                  across the shapes, and solving the bald spot problem
                </td>
              </tr>
              <tr>
                <td>dataset&rarr;dataset</td>
                <td>
                  defining a distance metric, determining scale and position of
                  target, and solving the bald spot problem
                </td>
              </tr>
            </tbody>
          </table>
        </section>
        <section id="speed">
          <!-- Limitation slide: the algorithm kept from the original research could potentially be vectorized. -->
          <h3>Speed</h3>
          <p>
            The algorithm from the original research is largely untouched and
            parts of it could potentially be vectorized to speed up the morphing
            process.
          </p>
        </section>
        <section id="data-scale">
          <!-- Limitation slide: the magnitude of the data values changes how many iterations the morph needs. -->
          <h3>Data scale affects morphing time</h3>
          <p>
            Smaller values (left subplot) morph in fewer iterations than larger
            values (right subplot) since we only move small amounts at a time:
          </p>
          <div class="center">
            <!-- The alt text describes the subplots named in the caption instead of repeating the file name. -->
            <img
              src="media/scale.png"
              alt="subplots of the data at half scale, actual scale, and scaled up"
            />
            <br />
            <small
              >Converting each of these into the circle shape takes ~25K
              iterations for the half-scale, ~50K iterations for the actual
              scale, and ~77.5K iterations for the scaled-up version.</small
            >
          </div>
        </section>
        <section id="convergence">
          <!-- Limitation slide: contrasts the current fixed iteration count with the goal of stopping early on convergence. -->
          <h3>Convergence</h3>
          <ul>
            <li>
              <b>Currently</b>: The user specifies the number of iterations to
              run. For datasets with small values, convergence might happen
              earlier; for datasets with larger values, this might happen well
              after this number of iterations.
            </li>
            <li>
              <b>Goal</b>: The user would specify the maximum number of
              iterations and the algorithm would stop early if the dataset had
              converged to the target shape.
            </li>
          </ul>
        </section>
        <section id="lessons-learned">
          <!-- Divider slide: opens the lessons learned part of the deck. -->
          <h2 class="center">Lessons learned and challenges faced</h2>
        </section>
        <section id="repeating-research">
          <!-- Lesson slide: reproducing the Datasaurus to circle conversion with the Autodesk researchers' code. -->
          <h3>Repeating research is hard</h3>
          <p>
            My first step was to use the
            <a
              href="https://github.com/jmatejka/same-stats-different-graphs"
              target="_blank"
              rel="noopener noreferrer"
              >Autodesk researchers' code</a
            >
            to recreate the conversion of the Datasaurus into a circle and
            figure out how the code worked.
          </p>
          <!-- The two fragment blocks are reveal.js fragments: the challenges and then the time taken appear step by step. -->
          <div class="fragment">
            <p style="padding-top: 10px">Challenges at this stage:</p>
            <ul>
              <li>Limited or no code documentation</li>
              <li>Partial codebase with unused variables and functions</li>
              <li>Generic variable names</li>
            </ul>
          </div>
          <div class="fragment">
            <p style="padding-top: 5px">TIME TAKEN: 4 hours</p>
          </div>
        </section>
        <section id="extending-research">
          <!-- Lesson slide: getting the code to work with a panda-shaped dataset. -->
          <h3>Extending research is harder</h3>
          <p>
            From there, I tried to get it to work with a panda-shaped dataset,
            reworked to have similar statistics to the Datasaurus.
          </p>
          <!-- The two fragment blocks are reveal.js fragments: the challenges and then the time taken appear step by step. -->
          <div class="fragment">
            <p style="padding-top: 10px">Challenges at this stage:</p>
            <ul>
              <li>Limited or no code documentation</li>
              <li>Partial codebase with unused variables and functions</li>
              <li>Hardcoded values (some of which were related to the data)</li>
            </ul>
          </div>
          <div class="fragment">
            <p style="padding-top: 5px">TIME TAKEN: 1.75 days</p>
          </div>
        </section>
        <section id="building-a-package">
          <!-- Lesson slide: the work involved in turning the working code into a distributable package. -->
          <h3>Building and distributing a package is a lot of work</h3>
          <p>
            Once I got the transformation working with the panda (my original
            goal), I realized this would be a helpful teaching tool and decided
            to make a package.
          </p>
          <!-- The two fragment blocks are reveal.js fragments: the challenges and then the time taken appear step by step. -->
          <div class="fragment">
            <p style="padding-top: 10px">Challenges at this stage:</p>
            <ul>
              <li>Purging unused variables and functions</li>
              <li>Refactoring a monolithic codebase</li>
              <li>
                Writing a pre-commit hook to validate numpydoc-style docstrings
                (<a
                  href="https://github.com/numpy/numpydoc/pull/454"
                  target="_blank"
                  rel="noopener noreferrer"
                  >PR 454</a
                >)
              </li>
              <li>Building and hosting documentation</li>
              <li>Creating a robust test suite from scratch</li>
              <li>Publishing to PyPI and conda-forge</li>
              <li>Automating workflows with GitHub Actions</li>
            </ul>
          </div>
          <div class="fragment">
            <p style="padding-top: 5px">TIME TAKEN: 2 months (v0.1.0)</p>
          </div>
        </section>
        <section id="upstream-issues">
          <!-- Side-note slide: two upstream pull requests linked to documentation problems hit while building Data Morph. -->
          <h3>Side note: Don't completely trust the docs</h3>
          <p>Here are some cases I bumped into while building Data Morph:</p>
          <ul>
            <li>
              Error in version switcher config example for pydata-sphinx-theme
              (<a
                href="https://github.com/pydata/pydata-sphinx-theme/pull/1279"
                target="_blank"
                rel="noopener noreferrer"
                >PR 1279</a
              >).
            </li>
            <li>
              Unable to report code coverage broken out by package and tests in
              PR using codecov configuration like Matplotlib's (<a
                href="https://github.com/matplotlib/matplotlib/pull/25698"
                target="_blank"
                rel="noopener noreferrer"
                >PR 25698</a
              >).
            </li>
          </ul>
        </section>
        <section id="helpful-resources">
          <!-- Resource slide: external guides on packaging, documentation hosting, and CI, each followed by its author or publisher. -->
          <h2>Helpful resources</h2>
          <ul>
            <li>
              <a
                href="https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html"
                target="_blank"
                rel="noopener noreferrer"
                >Configuring setuptools using pyproject.toml files</a
              >
              &ndash; Python Packaging Authority
            </li>
            <li>
              <!-- Link text matches the title of the linked tutorial. -->
              <a
                href="https://packaging.python.org/en/latest/tutorials/packaging-projects/"
                target="_blank"
                rel="noopener noreferrer"
                >Packaging Python Projects</a
              >
              &ndash; Python Packaging Authority
            </li>
            <li>
              <a
                href="https://olgarithms.github.io/sphinx-tutorial/docs/7-hosting-on-github-pages.html"
                target="_blank"
                rel="noopener noreferrer"
                >Building and hosting documentation on GitHub Pages</a
              >
              &ndash; Aya Elsayed and Olga Matoula
            </li>
            <li>
              <a
                href="https://hackmd.io/ElBrRQ6rT4K_dfzjY6pAFQ#3-Time-to-pack-%F0%9F%93%A6"
                target="_blank"
                rel="noopener noreferrer"
                >Python Packaging Tutorial: The Conda Way</a
              >
              &ndash; Bianca Henderson, Mahe Iram Khan, Valerio Maggio, and Dave
              Clements
            </li>
            <li>
              <a
                href="https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python"
                target="_blank"
                rel="noopener noreferrer"
                >Building and testing Python</a
              >
              &ndash; GitHub Actions docs
            </li>
          </ul>
        </section>
        <section id="closing-remarks">
          <!-- Closing slide: key takeaways plus install commands and links for Data Morph. -->
          <h2>Closing remarks</h2>
          <ul>
            <li>
              Summary statistics alone aren't enough to describe your data.
            </li>
            <li>
              Visualization is essential, but a single plot won't suffice.
            </li>
            <li>
              Try out Data Morph!
              <!-- The sub-list lives inside this item: a ul may only contain li children. -->
              <ul style="font-size: 0.9em">
                <li><code>python -m pip install data-morph-ai</code></li>
                <li>
                  <code>conda install -c conda-forge data-morph-ai</code>
                </li>
                <li>
                  docs:
                  <a
                    href="https://stefaniemolin.com/data-morph/"
                    target="_blank"
                    rel="noopener noreferrer"
                    >stefaniemolin.com/data-morph</a
                  >
                </li>
                <li>
                  repo:
                  <a
                    href="https://github.com/stefmolin/data-morph"
                    target="_blank"
                    rel="noopener noreferrer"
                    >github.com/stefmolin/data-morph</a
                  >
                </li>
                <li>
                  classroom activities:
                  <a
                    href="https://github.com/stefmolin/data-morph/#data-morph-in-the-classroom"
                    target="_blank"
                    rel="noopener noreferrer"
                    >github.com/stefmolin/data-morph/#data-morph-in-the-classroom</a
                  >
                </li>
              </ul>
            </li>
          </ul>
        </section>
        <section id="references">
          <!-- Bibliography slide: the works cited in the talk, each with a link to its source. -->
          <h2>References</h2>
          <ul class="references">
            <li>
              <!-- Page ranges use an en dash, like the other dashes in this file. -->
              Anscombe, F.J. (1973). Graphs in Statistical Analysis.
              <em>The American Statistician 27</em>, 1, 17&ndash;21.
              <a
                href="https://www.tandfonline.com/doi/abs/10.1080/00031305.1973.10478966"
                target="_blank"
                rel="noopener noreferrer"
              >
                https://www.tandfonline.com/doi/abs/10.1080/00031305.1973.10478966</a
              >
            </li>
            <li>
              Matejka, J., Fitzmaurice, G. (2017). Same Stats, Different Graphs:
              Generating Datasets with Varied Appearance and Identical
              Statistics through Simulated Annealing. In Proceedings of the 2017
              CHI Conference on Human Factors in Computing Systems (CHI '17).
              Association for Computing Machinery, New York, NY, USA,
              1290&ndash;1294.
              <a
                href="https://doi.org/10.1145/3025453.3025912"
                target="_blank"
                rel="noopener noreferrer"
                >https://doi.org/10.1145/3025453.3025912</a
              >
            </li>
            <li>
              Yanai, I., Lercher, M. (2020). A hypothesis is a liability.
              <em>Genome Biol 21</em>, 231.
              <a
                href="https://doi.org/10.1186/s13059-020-02133-w"
                target="_blank"
                rel="noopener noreferrer"
                >https://doi.org/10.1186/s13059-020-02133-w</a
              >
            </li>
            <li>
              Yanai, I., Lercher, M. (2020). Selective attention in
              hypothesis-driven data analysis. BioRxiv.
              <a
                href="https://doi.org/10.1101/2020.07.30.228916"
                target="_blank"
                rel="noopener noreferrer"
                >https://doi.org/10.1101/2020.07.30.228916</a
              >
            </li>
          </ul>
        </section>
        <section id="thank-you">
          <!-- Final slide: feedback QR code on the left, contact links on the right. -->
          <h2>Thank you!</h2>
          <p style="font-size: 0.65em">
            <em
              >I hope you enjoyed the session. You can follow my work on these
              platforms:</em
            >
          </p>
          <div
            style="
              display: flex;
              justify-content: space-evenly;
              align-items: center;
            "
          >
            <div style="text-align: center; width: 30%">
              <img
                alt="Please visit https://stefaniemolin.com/feedback to provide your feedback on this session."
                title="QR code to submit feedback (leads to https://stefaniemolin.com/feedback)"
                class="qr-code"
                src="https://raw.githubusercontent.com/stefmolin/pandas-workshop/main/media/qr-code.png"
              />
            </div>
            <!-- Each row pairs a decorative icon (hidden from assistive technology) with a link that opens in a new tab. -->
            <div style="font-size: 0.85em">
              <div style="display: flex">
                <i
                  class="fa fa-globe fa-fw"
                  style="padding-right: 4px; margin: auto 0"
                  aria-hidden="true"
                ></i>
                <a
                  href="https://stefaniemolin.com"
                  rel="noopener noreferrer"
                  target="_blank"
                >
                  stefaniemolin.com
                </a>
              </div>
              <div style="display: flex">
                <i
                  class="fab fa-github fa-fw"
                  style="padding-right: 4px; margin: auto 0"
                  aria-hidden="true"
                ></i>
                <a
                  href="https://github.com/stefmolin"
                  rel="noopener noreferrer"
                  target="_blank"
                >
                  github.com/stefmolin
                </a>
              </div>
              <div style="display: flex">
                <i
                  class="fab fa-twitter fa-fw"
                  style="padding-right: 4px; margin: auto 0"
                  aria-hidden="true"
                ></i>
                <a
                  href="https://twitter.com/StefanieMolin"
                  rel="noopener noreferrer"
                  target="_blank"
                >
                  twitter.com/StefanieMolin
                </a>
              </div>
              <div style="display: flex">
                <i
                  class="fab fa-linkedin fa-fw"
                  style="padding-right: 4px; margin: auto 0"
                  aria-hidden="true"
                ></i>
                <a
                  href="https://linkedin.com/in/stefanie-molin"
                  rel="noopener noreferrer"
                  target="_blank"
                >
                  linkedin.com/in/stefanie-molin
                </a>
              </div>
            </div>
          </div>
        </section>
      </div>
    </div>

    <!-- reveal.js core, pinned to 5.1.0; the inline script below calls Reveal.initialize() -->
    <!-- NOTE(review): the three unpkg scripts carry no integrity attribute, unlike the jQuery script; consider adding SRI hashes -->
    <script src="https://unpkg.com/reveal.js@5.1.0/dist/reveal.js"></script>
    <!-- jQuery slim build; the inline script below uses $ to copy the footer markup -->
    <script
      src="https://code.jquery.com/jquery-3.7.1.slim.min.js"
      integrity="sha256-kmHvs0B+OpCW5GVHUNjv9rOmY0IvSIRcf7zGUDTDQM8="
      crossorigin="anonymous"
    ></script>

    <!--syntax highlighting in code snippets-->
    <script src="https://unpkg.com/reveal.js@5.1.0/plugin/highlight/highlight.js"></script>
    <!-- notes plugin; registered as RevealNotes in the plugins list below -->
    <script src="https://unpkg.com/reveal.js@5.1.0/plugin/notes/notes.js"></script>
    <script>
      // Deck setup. Reference material:
      // - initialization: https://revealjs.com/initialization/
      // - configuration:  https://revealjs.com/config/
      // - plugins:        https://revealjs.com/plugins/
      const revealOptions = {
        controls: false,
        hash: true,
        plugins: [RevealHighlight, RevealNotes],
      };
      Reveal.initialize(revealOptions);

      // The footer markup is copied twice so it shows up both on the regular
      // slides and in the PDF version: once into the deck container right
      // away, and once into every slide background when Reveal reports ready.
      const footerMarkup = $("#footer-info").html();
      $("div.reveal").append(footerMarkup);
      Reveal.addEventListener("ready", () => {
        $(".slide-background").append(footerMarkup);
      });
    </script>
  </body>
</html>