Commit

Add LLM support with phi-2 via mlx on Apple silicon
cmdevries committed Jan 22, 2024
1 parent a7b3806 commit e4e8680
Showing 6 changed files with 94 additions and 12 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/python-app.yml
@@ -0,0 +1,39 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        python-version: "3.10"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest
12 changes: 10 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
 # Markovify
-Fetch text from p tags in URLs and generate text using markov chains with
-transition probabilities learned from the text.
+Fetch text from p tags in URLs and generate text using large language models or
+markov chains with content learned from the text.
 
 ## Usage
 The program takes a list of URLs to scrape via command line arguments.
@@ -28,3 +28,11 @@ Mix odd topics for more hilarity.

 She is using a person chooses to track how to his name be Jesus Day in Iraq
 and to as the law a fashion industry.
+
+Experimental support for generation using the Microsoft phi-2 LLM on Apple
+silicon using MLX.
+
+$ ./markovify.py --mlx
+
+The Nikon AF-S DX Nikkor mm f/2.8G ED VR is a full-frame,
+wide angle prime lens for Nikon DX format DSLR cameras.
51 changes: 41 additions & 10 deletions markovify.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 import random
+import re
 import requests
 import sys
 
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
 
 class Text(HTMLParser):
     """Extract text from <p> tags inside HTML."""
@@ -162,13 +163,13 @@ def generate_text(bigrams):
         return 'No statistics available to generate text'
     current_word = 'a'
     while current_word[0].islower(): # start with an upper case word
-        current_word = random.choice(bigrams.keys())
+        current_word = random.choice(list(bigrams.keys()))
     maximum = 10000
     text = ''
     for i in range(maximum):
         text += format_word(current_word)
         if current_word not in bigrams:
-            current_word = random.choice(bigrams.keys())
+            current_word = random.choice(list(bigrams.keys()))
             text += format_word(current_word)
         r = random.random()
         cumulative_probability = 0.0
@@ -190,11 +191,11 @@ def remove_broken_chains(bigrams):
     while removed_any:
         removed_any = False
         for countmap in bigrams.values():
-            for current_word in countmap.keys():
+            for current_word in list(countmap.keys()):
                 if current_word not in bigrams:
                     countmap.pop(current_word)
                     removed_any = True
-        for previous_word, countmap in bigrams.items():
+        for previous_word, countmap in list(bigrams.items()):
             if len(countmap) == 0:
                 bigrams.pop(previous_word)
                 removed_any = True
@@ -214,15 +215,45 @@ def process(urls):
     convert_to_probabilities(bigrams)
     print(generate_text(bigrams))
 
+def process_mlx(urls):
+    """process_mlx(list(string)) -> None
+    Prompt the Microsoft Phi-2 LLM with all the text inside <p> tags and
+    generate text.
+    """
+    all_text = ''
+    for url in urls:
+        text, final_url = fetch_text(url)
+        print('FETCHED TEXT FROM: %s\n' % final_url)
+        all_text += text
+    all_text = all_text.replace('\n', '')
+    all_text = re.sub(r'[^a-zA-Z\.\s]', '', all_text)
+    all_text = re.sub(r'\s+', ' ', all_text)
+    all_text = re.sub(r' \.', '.', all_text)
+    prompt = f'Summarize the following. {all_text}.'
+    from mlx_lm import load, generate
+    model, tokenizer = load('microsoft/phi-2')
+    response = generate(model, tokenizer, max_tokens=2048, prompt=prompt,
+                        verbose=True, temp=0.5)
+
 if __name__ == '__main__':
     if len(sys.argv) > 1 and sys.argv[1].lower() == '--help':
         print('usage: %s [list of urls to learn markov chains from]'
               % sys.argv[0])
+        print('To use the phi-2 LLM on MacBook with MLX:')
+        print('%s --mlx [list of urls to learn from]' % sys.argv[0])
         sys.exit(1)
-    pages = []
-    if len(sys.argv) < 2:
-        random_page = 'https://en.wikipedia.org/wiki/Special:Random'
-        pages = [random_page, random_page]
+    random_page = 'https://en.wikipedia.org/wiki/Special:Random'
+    if len(sys.argv) > 1 and sys.argv[1].lower() == '--mlx':
+        if len(sys.argv) < 3:
+            pages = [random_page, random_page]
+        else:
+            pages = sys.argv[2:]
+        process_mlx(pages)
     else:
-        pages = sys.argv[1:]
-    process(pages)
+        if len(sys.argv) < 2:
+            pages = [random_page, random_page]
+        else:
+            pages = sys.argv[1:]
+        process(pages)
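
For reference, the --mlx path added above boils down to the mlx-lm convenience API. Below is a minimal standalone sketch of that call pattern, assuming the packages from requirements-mlx.txt (mlx, mlx-lm) are installed on an Apple silicon Mac and an mlx-lm release contemporary with this commit that still accepts a temp keyword; the prompt text is illustrative and not part of the commit, while the model name, max_tokens and temp mirror process_mlx.

# Minimal sketch of the mlx-lm calls wrapped by process_mlx(); the prompt here is illustrative.
from mlx_lm import load, generate

# Fetches and loads the phi-2 weights the first time it runs.
model, tokenizer = load('microsoft/phi-2')

prompt = 'Summarize the following. MLX is an array framework for machine learning on Apple silicon.'
# verbose=True streams tokens to stdout as they are generated; temp=0.5 matches process_mlx.
response = generate(model, tokenizer, prompt=prompt, max_tokens=2048,
                    verbose=True, temp=0.5)
print(response)

process_mlx builds its prompt from the <p> text scraped from the given URLs rather than a fixed string, but the load/generate flow is the same.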
3 changes: 3 additions & 0 deletions requirements-mlx.txt
@@ -0,0 +1,3 @@
requests
mlx
mlx-lm
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
requests
File renamed without changes.
