Comparison with SQL#

Since many potential pandas users have some familiarity with SQL, this page is meant to provide some examples of how various SQL operations would be performed using pandas.

If you’re new to pandas, you might want to first read through 10 Minutes to pandas to familiarize yourself with the library.

As is customary, we import pandas and NumPy as follows:

In [1]: import pandas as pd

In [2]: import numpy as np

Most of the examples will utilize the tips dataset found within pandas tests. We’ll read the data into a DataFrame called tips and assume we have a database table of the same name and structure.

In [3]: url = (
   ...:     "https://raw.githubusercontent.com/pandas-dev"
   ...:     "/pandas/main/pandas/tests/io/data/csv/tips.csv"
   ...: )
   ...: 

In [4]: tips = pd.read_csv(url)
---------------------------------------------------------------------------
ConnectionRefusedError                    Traceback (most recent call last)
File /usr/lib/python3.13/urllib/request.py:1319, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args)
   1318 try:
-> 1319     h.request(req.get_method(), req.selector, req.data, headers,
   1320               encode_chunked=req.has_header('Transfer-encoding'))
   1321 except OSError as err: # timeout error

File /usr/lib/python3.13/http/client.py:1338, in HTTPConnection.request(self, method, url, body, headers, encode_chunked)
   1337 """Send a complete request to the server."""
-> 1338 self._send_request(method, url, body, headers, encode_chunked)

File /usr/lib/python3.13/http/client.py:1384, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked)
   1383     body = _encode(body, 'body')
-> 1384 self.endheaders(body, encode_chunked=encode_chunked)

File /usr/lib/python3.13/http/client.py:1333, in HTTPConnection.endheaders(self, message_body, encode_chunked)
   1332     raise CannotSendHeader()
-> 1333 self._send_output(message_body, encode_chunked=encode_chunked)

File /usr/lib/python3.13/http/client.py:1093, in HTTPConnection._send_output(self, message_body, encode_chunked)
   1092 del self._buffer[:]
-> 1093 self.send(msg)
   1095 if message_body is not None:
   1096 
   1097     # create a consistent interface to message_body

File /usr/lib/python3.13/http/client.py:1037, in HTTPConnection.send(self, data)
   1036 if self.auto_open:
-> 1037     self.connect()
   1038 else:

File /usr/lib/python3.13/http/client.py:1472, in HTTPSConnection.connect(self)
   1470 "Connect to a host on a given (SSL) port."
-> 1472 super().connect()
   1474 if self._tunnel_host:

File /usr/lib/python3.13/http/client.py:1003, in HTTPConnection.connect(self)
   1002 sys.audit("http.client.connect", self, self.host, self.port)
-> 1003 self.sock = self._create_connection(
   1004     (self.host,self.port), self.timeout, self.source_address)
   1005 # Might fail in OSs that don't implement TCP_NODELAY

File /usr/lib/python3.13/socket.py:864, in create_connection(address, timeout, source_address, all_errors)
    863 if not all_errors:
--> 864     raise exceptions[0]
    865 raise ExceptionGroup("create_connection failed", exceptions)

File /usr/lib/python3.13/socket.py:849, in create_connection(address, timeout, source_address, all_errors)
    848     sock.bind(source_address)
--> 849 sock.connect(sa)
    850 # Break explicitly a reference cycle

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
Cell In[4], line 1
----> 1 tips = pd.read_csv(url)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File /usr/lib/python3/dist-packages/pandas/io/common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    725     codecs.lookup_error(errors)
    727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
    729     path_or_buf,
    730     encoding=encoding,
    731     compression=compression,
    732     mode=mode,
    733     storage_options=storage_options,
    734 )
    736 handle = ioargs.filepath_or_buffer
    737 handles: list[BaseBuffer]

File /usr/lib/python3/dist-packages/pandas/io/common.py:384, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    382 # assuming storage_options is to be interpreted as headers
    383 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 384 with urlopen(req_info) as req:
    385     content_encoding = req.headers.get("Content-Encoding", None)
    386     if content_encoding == "gzip":
    387         # Override compression based on Content-Encoding header

File /usr/lib/python3/dist-packages/pandas/io/common.py:289, in urlopen(*args, **kwargs)
    283 """
    284 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
    285 the stdlib.
    286 """
    287 import urllib.request
--> 289 return urllib.request.urlopen(*args, **kwargs)

File /usr/lib/python3.13/urllib/request.py:189, in urlopen(url, data, timeout, context)
    187 else:
    188     opener = _opener
--> 189 return opener.open(url, data, timeout)

File /usr/lib/python3.13/urllib/request.py:489, in OpenerDirector.open(self, fullurl, data, timeout)
    486     req = meth(req)
    488 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 489 response = self._open(req, data)
    491 # post-process response
    492 meth_name = protocol+"_response"

File /usr/lib/python3.13/urllib/request.py:506, in OpenerDirector._open(self, req, data)
    503     return result
    505 protocol = req.type
--> 506 result = self._call_chain(self.handle_open, protocol, protocol +
    507                           '_open', req)
    508 if result:
    509     return result

File /usr/lib/python3.13/urllib/request.py:466, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    464 for handler in handlers:
    465     func = getattr(handler, meth_name)
--> 466     result = func(*args)
    467     if result is not None:
    468         return result

File /usr/lib/python3.13/urllib/request.py:1367, in HTTPSHandler.https_open(self, req)
   1366 def https_open(self, req):
-> 1367     return self.do_open(http.client.HTTPSConnection, req,
   1368                         context=self._context)

File /usr/lib/python3.13/urllib/request.py:1322, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args)
   1319         h.request(req.get_method(), req.selector, req.data, headers,
   1320                   encode_chunked=req.has_header('Transfer-encoding'))
   1321     except OSError as err: # timeout error
-> 1322         raise URLError(err)
   1323     r = h.getresponse()
   1324 except:

URLError: <urlopen error [Errno 111] Connection refused>

In [5]: tips
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 tips

NameError: name 'tips' is not defined

Copies vs. in place operations#

Most pandas operations return copies of the Series/DataFrame. To make the changes “stick”, you’ll need to either assign to a new variable:

sorted_df = df.sort_values("col1")

or overwrite the original one:

df = df.sort_values("col1")

Note

You will see an inplace=True or copy=False keyword argument available for some methods:

df.replace(5, inplace=True)

There is an active discussion about deprecating and removing inplace and copy for most methods (e.g. dropna) except for a very small subset of methods (including replace). Both keywords won’t be necessary anymore in the context of Copy-on-Write. The proposal can be found here.

SELECT#

In SQL, selection is done using a comma-separated list of columns you’d like to select (or a * to select all columns):

SELECT total_bill, tip, smoker, time
FROM tips;

With pandas, column selection is done by passing a list of column names to your DataFrame:

In [6]: tips[["total_bill", "tip", "smoker", "time"]]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 tips[["total_bill", "tip", "smoker", "time"]]

NameError: name 'tips' is not defined

Calling the DataFrame without the list of column names would display all columns (akin to SQL’s *).

In SQL, you can add a calculated column:

SELECT *, tip/total_bill as tip_rate
FROM tips;

With pandas, you can use the DataFrame.assign() method of a DataFrame to append a new column:

In [7]: tips.assign(tip_rate=tips["tip"] / tips["total_bill"])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 tips.assign(tip_rate=tips["tip"] / tips["total_bill"])

NameError: name 'tips' is not defined

WHERE#

Filtering in SQL is done via a WHERE clause.

SELECT *
FROM tips
WHERE time = 'Dinner';

DataFrames can be filtered in multiple ways; the most intuitive of which is using boolean indexing.

In [8]: tips[tips["total_bill"] > 10]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 tips[tips["total_bill"] > 10]

NameError: name 'tips' is not defined

The above statement is simply passing a Series of True/False objects to the DataFrame, returning all rows with True.

In [9]: is_dinner = tips["time"] == "Dinner"
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 is_dinner = tips["time"] == "Dinner"

NameError: name 'tips' is not defined

In [10]: is_dinner
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 is_dinner

NameError: name 'is_dinner' is not defined

In [11]: is_dinner.value_counts()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 is_dinner.value_counts()

NameError: name 'is_dinner' is not defined

In [12]: tips[is_dinner]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 1
----> 1 tips[is_dinner]

NameError: name 'tips' is not defined

Just like SQL’s OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and & (AND).

Tips of more than $5 at Dinner meals:

SELECT *
FROM tips
WHERE time = 'Dinner' AND tip > 5.00;
In [13]: tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 1
----> 1 tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)]

NameError: name 'tips' is not defined

Tips by parties of at least 5 diners OR bill total was more than $45:

SELECT *
FROM tips
WHERE size >= 5 OR total_bill > 45;
In [14]: tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 1
----> 1 tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)]

NameError: name 'tips' is not defined

NULL checking is done using the notna() and isna() methods.

In [15]: frame = pd.DataFrame(
   ....:     {"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]}
   ....: )
   ....: 

In [16]: frame
Out[16]: 
  col1 col2
0    A    F
1    B  NaN
2  NaN    G
3    C    H
4    D    I

Assume we have a table of the same structure as our DataFrame above. We can see only the records where col2 IS NULL with the following query:

SELECT *
FROM frame
WHERE col2 IS NULL;
In [17]: frame[frame["col2"].isna()]
Out[17]: 
  col1 col2
1    B  NaN

Getting items where col1 IS NOT NULL can be done with notna().

SELECT *
FROM frame
WHERE col1 IS NOT NULL;
In [18]: frame[frame["col1"].notna()]
Out[18]: 
  col1 col2
0    A    F
1    B  NaN
3    C    H
4    D    I

GROUP BY#

In pandas, SQL’s GROUP BY operations are performed using the similarly named groupby() method. groupby() typically refers to a process where we’d like to split a dataset into groups, apply some function (typically aggregation) , and then combine the groups together.

A common SQL operation would be getting the count of records in each group throughout a dataset. For instance, a query getting us the number of tips left by sex:

SELECT sex, count(*)
FROM tips
GROUP BY sex;
/*
Female     87
Male      157
*/

The pandas equivalent would be:

In [19]: tips.groupby("sex").size()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 tips.groupby("sex").size()

NameError: name 'tips' is not defined

Notice that in the pandas code we used DataFrameGroupBy.size() and not DataFrameGroupBy.count(). This is because DataFrameGroupBy.count() applies the function to each column, returning the number of NOT NULL records within each.

In [20]: tips.groupby("sex").count()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 tips.groupby("sex").count()

NameError: name 'tips' is not defined

Alternatively, we could have applied the DataFrameGroupBy.count() method to an individual column:

In [21]: tips.groupby("sex")["total_bill"].count()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 1
----> 1 tips.groupby("sex")["total_bill"].count()

NameError: name 'tips' is not defined

Multiple functions can also be applied at once. For instance, say we’d like to see how tip amount differs by day of the week - DataFrameGroupBy.agg() allows you to pass a dictionary to your grouped DataFrame, indicating which functions to apply to specific columns.

SELECT day, AVG(tip), COUNT(*)
FROM tips
GROUP BY day;
/*
Fri   2.734737   19
Sat   2.993103   87
Sun   3.255132   76
Thu  2.771452   62
*/
In [22]: tips.groupby("day").agg({"tip": "mean", "day": "size"})
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 tips.groupby("day").agg({"tip": "mean", "day": "size"})

NameError: name 'tips' is not defined

Grouping by more than one column is done by passing a list of columns to the groupby() method.

SELECT smoker, day, COUNT(*), AVG(tip)
FROM tips
GROUP BY smoker, day;
/*
smoker day
No     Fri      4  2.812500
       Sat     45  3.102889
       Sun     57  3.167895
       Thu    45  2.673778
Yes    Fri     15  2.714000
       Sat     42  2.875476
       Sun     19  3.516842
       Thu    17  3.030000
*/
In [23]: tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 1
----> 1 tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})

NameError: name 'tips' is not defined

JOIN#

JOINs can be performed with join() or merge(). By default, join() will join the DataFrames on their indices. Each method has parameters allowing you to specify the type of join to perform (LEFT, RIGHT, INNER, FULL) or the columns to join on (column names or indices).

Warning

If both key columns contain rows where the key is a null value, those rows will be matched against each other. This is different from usual SQL join behaviour and can lead to unexpected results.

In [24]: df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})

In [25]: df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})

Assume we have two database tables of the same name and structure as our DataFrames.

Now let’s go over the various types of JOINs.

INNER JOIN#

SELECT *
FROM df1
INNER JOIN df2
  ON df1.key = df2.key;
# merge performs an INNER JOIN by default
In [26]: pd.merge(df1, df2, on="key")
Out[26]: 
  key   value_x   value_y
0   B -0.282863  1.212112
1   D -1.135632 -0.173215
2   D -1.135632  0.119209

merge() also offers parameters for cases when you’d like to join one DataFrame’s column with another DataFrame’s index.

In [27]: indexed_df2 = df2.set_index("key")

In [28]: pd.merge(df1, indexed_df2, left_on="key", right_index=True)
Out[28]: 
  key   value_x   value_y
1   B -0.282863  1.212112
3   D -1.135632 -0.173215
3   D -1.135632  0.119209

LEFT OUTER JOIN#

Show all records from df1.

SELECT *
FROM df1
LEFT OUTER JOIN df2
  ON df1.key = df2.key;
In [29]: pd.merge(df1, df2, on="key", how="left")
Out[29]: 
  key   value_x   value_y
0   A  0.469112       NaN
1   B -0.282863  1.212112
2   C -1.509059       NaN
3   D -1.135632 -0.173215
4   D -1.135632  0.119209

RIGHT JOIN#

Show all records from df2.

SELECT *
FROM df1
RIGHT OUTER JOIN df2
  ON df1.key = df2.key;
In [30]: pd.merge(df1, df2, on="key", how="right")
Out[30]: 
  key   value_x   value_y
0   B -0.282863  1.212112
1   D -1.135632 -0.173215
2   D -1.135632  0.119209
3   E       NaN -1.044236

FULL JOIN#

pandas also allows for FULL JOINs, which display both sides of the dataset, whether or not the joined columns find a match. As of writing, FULL JOINs are not supported in all RDBMS (MySQL).

Show all records from both tables.

SELECT *
FROM df1
FULL OUTER JOIN df2
  ON df1.key = df2.key;
In [31]: pd.merge(df1, df2, on="key", how="outer")
Out[31]: 
  key   value_x   value_y
0   A  0.469112       NaN
1   B -0.282863  1.212112
2   C -1.509059       NaN
3   D -1.135632 -0.173215
4   D -1.135632  0.119209
5   E       NaN -1.044236

UNION#

UNION ALL can be performed using concat().

In [32]: df1 = pd.DataFrame(
   ....:     {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)}
   ....: )
   ....: 

In [33]: df2 = pd.DataFrame(
   ....:     {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]}
   ....: )
   ....: 
SELECT city, rank
FROM df1
UNION ALL
SELECT city, rank
FROM df2;
/*
         city  rank
      Chicago     1
San Francisco     2
New York City     3
      Chicago     1
       Boston     4
  Los Angeles     5
*/
In [34]: pd.concat([df1, df2])
Out[34]: 
            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3
0        Chicago     1
1         Boston     4
2    Los Angeles     5

SQL’s UNION is similar to UNION ALL, however UNION will remove duplicate rows.

SELECT city, rank
FROM df1
UNION
SELECT city, rank
FROM df2;
-- notice that there is only one Chicago record this time
/*
         city  rank
      Chicago     1
San Francisco     2
New York City     3
       Boston     4
  Los Angeles     5
*/

In pandas, you can use concat() in conjunction with drop_duplicates().

In [35]: pd.concat([df1, df2]).drop_duplicates()
Out[35]: 
            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3
1         Boston     4
2    Los Angeles     5

LIMIT#

SELECT * FROM tips
LIMIT 10;
In [36]: tips.head(10)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[36], line 1
----> 1 tips.head(10)

NameError: name 'tips' is not defined

pandas equivalents for some SQL analytic and aggregate functions#

Top n rows with offset#

-- MySQL
SELECT * FROM tips
ORDER BY tip DESC
LIMIT 10 OFFSET 5;
In [37]: tips.nlargest(10 + 5, columns="tip").tail(10)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[37], line 1
----> 1 tips.nlargest(10 + 5, columns="tip").tail(10)

NameError: name 'tips' is not defined

Top n rows per group#

-- Oracle's ROW_NUMBER() analytic function
SELECT * FROM (
  SELECT
    t.*,
    ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
  FROM tips t
)
WHERE rn < 3
ORDER BY day, rn;
In [38]: (
   ....:     tips.assign(
   ....:         rn=tips.sort_values(["total_bill"], ascending=False)
   ....:         .groupby(["day"])
   ....:         .cumcount()
   ....:         + 1
   ....:     )
   ....:     .query("rn < 3")
   ....:     .sort_values(["day", "rn"])
   ....: )
   ....: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[38], line 2
      1 (
----> 2     tips.assign(
      3         rn=tips.sort_values(["total_bill"], ascending=False)
      4         .groupby(["day"])
      5         .cumcount()
      6         + 1
      7     )
      8     .query("rn < 3")
      9     .sort_values(["day", "rn"])
     10 )

NameError: name 'tips' is not defined

the same using rank(method='first') function

In [39]: (
   ....:     tips.assign(
   ....:         rnk=tips.groupby(["day"])["total_bill"].rank(
   ....:             method="first", ascending=False
   ....:         )
   ....:     )
   ....:     .query("rnk < 3")
   ....:     .sort_values(["day", "rnk"])
   ....: )
   ....: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[39], line 2
      1 (
----> 2     tips.assign(
      3         rnk=tips.groupby(["day"])["total_bill"].rank(
      4             method="first", ascending=False
      5         )
      6     )
      7     .query("rnk < 3")
      8     .sort_values(["day", "rnk"])
      9 )

NameError: name 'tips' is not defined
-- Oracle's RANK() analytic function
SELECT * FROM (
  SELECT
    t.*,
    RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
  FROM tips t
  WHERE tip < 2
)
WHERE rnk < 3
ORDER BY sex, rnk;

Let’s find tips with (rank < 3) per gender group for (tips < 2). Notice that when using rank(method='min') function rnk_min remains the same for the same tip (as Oracle’s RANK() function)

In [40]: (
   ....:     tips[tips["tip"] < 2]
   ....:     .assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min"))
   ....:     .query("rnk_min < 3")
   ....:     .sort_values(["sex", "rnk_min"])
   ....: )
   ....: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[40], line 2
      1 (
----> 2     tips[tips["tip"] < 2]
      3     .assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min"))
      4     .query("rnk_min < 3")
      5     .sort_values(["sex", "rnk_min"])
      6 )

NameError: name 'tips' is not defined

UPDATE#

UPDATE tips
SET tip = tip*2
WHERE tip < 2;
In [41]: tips.loc[tips["tip"] < 2, "tip"] *= 2
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[41], line 1
----> 1 tips.loc[tips["tip"] < 2, "tip"] *= 2

NameError: name 'tips' is not defined

DELETE#

DELETE FROM tips
WHERE tip > 9;

In pandas we select the rows that should remain instead of deleting them:

In [42]: tips = tips.loc[tips["tip"] <= 9]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[42], line 1
----> 1 tips = tips.loc[tips["tip"] <= 9]

NameError: name 'tips' is not defined