anthropics · itsaark · Mar 20, 2024
diff --git a/misc/building_moderation_filter.ipynb b/misc/building_moderation_filter.ipynb
@@ -54,34 +54,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "from anthropic import Anthropic\n",
+    "\n",
     "client = Anthropic()\n",
     "MODEL_NAME = \"claude-3-haiku-20240307\"\n",
     "\n",
     "def moderate_text(user_text, guidelines):\n",
     "    prompt_template = \"\"\"\n",
     "    You are a content moderation expert tasked with categorizing user-generated text based on the following guidelines:\n",
-    "\n",
+    "    \n",
+    "    <guidelines>\n",
     "    {guidelines}\n",
-    "\n",
-    "    Here is the user-generated text to categorize:\n",
+    "    </guidelines>\n",
+    "    \n",
+    "    Based on the above guidelines, classify the user text as either ALLOW or BLOCK. Return nothing else.\n",
+    "    \"\"\"\n",
+    "    user_content_template = \"\"\"\n",
     "    <user_text>{user_text}</user_text>\n",
-    "\n",
-    "    Based on the guidelines above, classify this text as either ALLOW or BLOCK. Return nothing else.\n",
     "    \"\"\"\n",
     "\n",
-    "    # Format the prompt with the user text\n",
-    "    prompt = prompt_template.format(user_text=user_text, guidelines=guidelines)\n",
+    "    # Format the system prompt with the guidelines\n",
+    "    system_prompt = prompt_template.format(guidelines=guidelines)\n",
+    "\n",
+    "    # Format the user content with the user text\n",
+    "    user_content = user_content_template.format(user_text=user_text)\n",
     "\n",
     "    # Send the prompt to Claude and get the response\n",
     "    response = client.messages.create(\n",
     "        model=MODEL_NAME,\n",
     "        max_tokens=10,\n",
-    "        messages=[{\"role\": \"user\", \"content\": prompt}]\n",
+    "        system=system_prompt,\n",
+    "        messages=[{\"role\": \"user\", \"content\": user_content}]\n",
     "    ).content[0].text\n",
     "\n",
     "    return response"
@@ -96,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -153,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -217,15 +224,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "<thinking>\n",
-      "The post appears to be promoting a band rather than discussing rollercoasters, theme parks, or the amusement industry. This falls under the \"spam, advertisements, or self-promotion\" category, which is grounds for blocking the post.\n",
+      "The post is self-promoting a band, which is not related to rollercoasters, theme parks, or the amusement industry. It also appears to be an advertisement or spam. These aspects would likely be serious enough to categorize this post as BLOCK.\n",
       "</thinking>\n",
       "\n",
       "<output>BLOCK</output>\n"
@@ -247,17 +254,20 @@
     "- Some mild profanity or crude language, as long as it is not directed at individuals\n",
     "\n",
     "First, inside of <thinking> tags, identify any potentially concerning aspects of the post based on the guidelines below and consider whether those aspects are serious enough to block the post or not. Finally, classify this text as either ALLOW or BLOCK inside <output> tags. Return nothing else.\n",
+    "'''\n",
     "\n",
-    "Given those instructions, here is the post to categorize:\n",
+    "user_content_template = '''Here is the post to categorize:\n",
     "\n",
-    "<user_post>{user_post}</user_post>'''\n",
+    "<user_post>{user_post}</user_post>\n",
+    "'''\n",
     "\n",
     "user_post = \"Introducing my new band - Coaster Shredders. Check us out on YouTube!!\"\n",
     "\n",
     "response = client.messages.create(\n",
     "        model=MODEL_NAME,\n",
     "        max_tokens=1000,\n",
-    "        messages=[{\"role\": \"user\", \"content\": cot_prompt.format(user_post=user_post)}]\n",
+    "        system = cot_prompt,\n",
+    "        messages=[{\"role\": \"user\", \"content\": user_content_template.format(user_post=user_post)}]\n",
     "    ).content[0].text\n",
     "\n",
     "print(response)"
@@ -273,7 +283,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -313,17 +323,20 @@
     "Category: ALLOW\n",
     "</examples>\n",
     "\n",
-    "Given those examples, here is the user-generated text to categorize:\n",
-    "<user_text>{user_text}</user_text>\n",
+    "Based on the guidelines above, classify this user text as either ALLOW or BLOCK. Return nothing else.\n",
+    "'''\n",
     "\n",
-    "Based on the guidelines above, classify this text as either ALLOW or BLOCK. Return nothing else.'''\n",
+    "user_content_template = '''Here is the user-generated text to categorize:\n",
+    "<user_text>{user_text}</user_text>\n",
+    "'''\n",
     "\n",
     "user_post = \"Why Boomerang Coasters Ain't It (Don't @ Me)\"\n",
     "\n",
     "response = client.messages.create(\n",
     "        model=MODEL_NAME,\n",
     "        max_tokens=1000,\n",
-    "        messages=[{\"role\": \"user\", \"content\": examples_prompt.format(user_text=user_post)}]\n",
+    "        system=examples_prompt,\n",
+    "        messages=[{\"role\": \"user\", \"content\": user_content_template.format(user_text=user_post)}]\n",
     "    ).content[0].text\n",
     "\n",
     "print(response)"
@@ -346,7 +359,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,